From 8a849a2a567d4e519b246a16936b6e7519936d4b Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Thu, 10 Oct 2024 13:37:44 +0200 Subject: [PATCH 001/177] Revert "Reapply "[AMDGPU][GlobalISel] Fix load/store of pointer vectors, buffer.*.pN (#110714)" v2 (#111708)" This reverts commit 4b4a0d419c81b8b12a7dbb33dae1f7e9be91a88f. New test fails on buildbots https://lab.llvm.org/buildbot/#/builders/63/builds/2039 https://lab.llvm.org/buildbot/#/builders/127/builds/1055 --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 61 +- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 12 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 2 +- .../GlobalISel/buffer-load-store-pointers.ll | 301 -- ...st-select-load-global-old-legalization.mir | 3300 ----------------- .../GlobalISel/inst-select-load-local.mir | 96 +- .../GlobalISel/legalize-load-constant.mir | 51 +- .../AMDGPU/GlobalISel/legalize-load-flat.mir | 152 +- .../GlobalISel/legalize-load-global.mir | 98 +- .../AMDGPU/GlobalISel/legalize-load-local.mir | 50 +- .../GlobalISel/legalize-load-private.mir | 83 +- .../GlobalISel/legalize-store-global.mir | 84 +- 12 files changed, 275 insertions(+), 4015 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-old-legalization.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 4409a0d50e553e..b35f9faf024bdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -494,8 +494,6 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) { return false; const unsigned Size = Ty.getSizeInBits(); - if (Ty.isPointerVector()) - return true; if (Size <= 64) return false; // Address space 8 pointers get their own workaround. 
@@ -504,6 +502,9 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) { if (!Ty.isVector()) return true; + if (Ty.isPointerVector()) + return true; + unsigned EltSize = Ty.getScalarSizeInBits(); return EltSize != 32 && EltSize != 64; } @@ -5817,9 +5818,8 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, return Reg; } -Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B, - Register VData, LLT MemTy, - bool IsFormat) const { +Register AMDGPULegalizerInfo::fixStoreSourceType( + MachineIRBuilder &B, Register VData, bool IsFormat) const { MachineRegisterInfo *MRI = B.getMRI(); LLT Ty = MRI->getType(VData); @@ -5829,10 +5829,6 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B, if (hasBufferRsrcWorkaround(Ty)) return castBufferRsrcToV4I32(VData, B); - if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) { - Ty = getBitcastRegisterType(Ty); - VData = B.buildBitcast(Ty, VData).getReg(0); - } // Fixup illegal register types for i8 stores. if (Ty == LLT::scalar(8) || Ty == S16) { Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); @@ -5850,27 +5846,23 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B, } bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, - LegalizerHelper &Helper, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsTyped, bool IsFormat) const { - MachineIRBuilder &B = Helper.MIRBuilder; - MachineRegisterInfo &MRI = *B.getMRI(); - Register VData = MI.getOperand(1).getReg(); LLT Ty = MRI.getType(VData); LLT EltTy = Ty.getScalarType(); const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); const LLT S32 = LLT::scalar(32); - MachineMemOperand *MMO = *MI.memoperands_begin(); - const int MemSize = MMO->getSize().getValue(); - LLT MemTy = MMO->getMemoryType(); - - VData = fixStoreSourceType(B, VData, MemTy, IsFormat); - + VData = fixStoreSourceType(B, VData, IsFormat); castBufferRsrcArgToV4I32(MI, B, 2); Register RSrc = MI.getOperand(2).getReg(); + 
MachineMemOperand *MMO = *MI.memoperands_begin(); + const int MemSize = MMO->getSize().getValue(); + unsigned ImmOffset; // The typed intrinsics add an immediate after the registers. @@ -5962,13 +5954,10 @@ static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, } bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, - LegalizerHelper &Helper, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsFormat, bool IsTyped) const { - MachineIRBuilder &B = Helper.MIRBuilder; - MachineRegisterInfo &MRI = *B.getMRI(); - GISelChangeObserver &Observer = Helper.Observer; - // FIXME: Verifier should enforce 1 MMO for these intrinsics. MachineMemOperand *MMO = *MI.memoperands_begin(); const LLT MemTy = MMO->getMemoryType(); @@ -6017,21 +6006,9 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the // logic doesn't have to handle that case. if (hasBufferRsrcWorkaround(Ty)) { - Observer.changingInstr(MI); Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0); - Observer.changedInstr(MI); Dst = MI.getOperand(0).getReg(); - B.setInsertPt(B.getMBB(), MI); } - if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) { - Ty = getBitcastRegisterType(Ty); - Observer.changingInstr(MI); - Helper.bitcastDst(MI, Ty, 0); - Observer.changedInstr(MI); - Dst = MI.getOperand(0).getReg(); - B.setInsertPt(B.getMBB(), MI); - } - LLT EltTy = Ty.getScalarType(); const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); const bool Unpacked = ST.hasUnpackedD16VMem(); @@ -7411,17 +7388,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_raw_ptr_buffer_store: case Intrinsic::amdgcn_struct_buffer_store: case Intrinsic::amdgcn_struct_ptr_buffer_store: - return legalizeBufferStore(MI, Helper, false, false); + return legalizeBufferStore(MI, MRI, B, false, false); case Intrinsic::amdgcn_raw_buffer_store_format: case 
Intrinsic::amdgcn_raw_ptr_buffer_store_format: case Intrinsic::amdgcn_struct_buffer_store_format: case Intrinsic::amdgcn_struct_ptr_buffer_store_format: - return legalizeBufferStore(MI, Helper, false, true); + return legalizeBufferStore(MI, MRI, B, false, true); case Intrinsic::amdgcn_raw_tbuffer_store: case Intrinsic::amdgcn_raw_ptr_tbuffer_store: case Intrinsic::amdgcn_struct_tbuffer_store: case Intrinsic::amdgcn_struct_ptr_tbuffer_store: - return legalizeBufferStore(MI, Helper, true, true); + return legalizeBufferStore(MI, MRI, B, true, true); case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: case Intrinsic::amdgcn_raw_atomic_buffer_load: @@ -7430,17 +7407,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_struct_ptr_buffer_load: case Intrinsic::amdgcn_struct_atomic_buffer_load: case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: - return legalizeBufferLoad(MI, Helper, false, false); + return legalizeBufferLoad(MI, MRI, B, false, false); case Intrinsic::amdgcn_raw_buffer_load_format: case Intrinsic::amdgcn_raw_ptr_buffer_load_format: case Intrinsic::amdgcn_struct_buffer_load_format: case Intrinsic::amdgcn_struct_ptr_buffer_load_format: - return legalizeBufferLoad(MI, Helper, true, false); + return legalizeBufferLoad(MI, MRI, B, true, false); case Intrinsic::amdgcn_raw_tbuffer_load: case Intrinsic::amdgcn_raw_ptr_tbuffer_load: case Intrinsic::amdgcn_struct_tbuffer_load: case Intrinsic::amdgcn_struct_ptr_tbuffer_load: - return legalizeBufferLoad(MI, Helper, true, true); + return legalizeBufferLoad(MI, MRI, B, true, true); case Intrinsic::amdgcn_raw_buffer_atomic_swap: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: case Intrinsic::amdgcn_struct_buffer_atomic_swap: diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 86c15197805d23..84470dc75b60ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ 
b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -195,13 +195,15 @@ class AMDGPULegalizerInfo final : public LegalizerInfo { Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore = false) const; - Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, + Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const; - bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, - bool IsTyped, bool IsFormat) const; - bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, - bool IsFormat, bool IsTyped) const; + bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsTyped, + bool IsFormat) const; + bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, bool IsFormat, + bool IsTyped) const; bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 902feacede83f4..ef9adde13348fe 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -590,7 +590,7 @@ class RegisterTypes reg_types> { def Reg16Types : RegisterTypes<[i16, f16, bf16]>; def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>; -def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, p1, p4, v4i16, v4f16, v4bf16]>; +def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, v4i16, v4f16, v4bf16]>; def Reg96Types : RegisterTypes<[v3i32, v3f32]>; def Reg128Types : RegisterTypes<[v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16]>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll deleted file mode 100644 index 091c9f143ce7ee..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-load-store-pointers.ll +++ 
/dev/null @@ -1,301 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck --check-prefix=GFX9 %s - -define ptr @buffer_load_p0(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_p0 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1 - ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]] - ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 - %ret = call ptr @llvm.amdgcn.raw.ptr.buffer.load.p0(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret ptr %ret -} - -define void @buffer_store_p0(ptr %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_p0 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE 
[[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX9-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void @llvm.amdgcn.raw.ptr.buffer.store.p0(ptr %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} - -define ptr addrspace(1) @buffer_load_p1(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_p1 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1 - ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]] - ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]] - ; GFX9-NEXT: SI_RETURN 
implicit $vgpr0, implicit $vgpr1 - %ret = call ptr addrspace(1) @llvm.amdgcn.raw.ptr.buffer.load.p1(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret ptr addrspace(1) %ret -} - -define void @buffer_store_p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_p1 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX9-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void @llvm.amdgcn.raw.ptr.buffer.store.p1(ptr addrspace(1) %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} - -define ptr addrspace(4) @buffer_load_p4(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_p4 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s64) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1 - ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]] - ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 - %ret = call ptr addrspace(4) @llvm.amdgcn.raw.ptr.buffer.load.p4(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret ptr addrspace(4) %ret -} - -define void @buffer_store_p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_p4 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; GFX9-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s64) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void 
@llvm.amdgcn.raw.ptr.buffer.store.p4(ptr addrspace(4) %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} - -define ptr addrspace(5) @buffer_load_p5(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0 - %ret = call ptr addrspace(5) @llvm.amdgcn.raw.ptr.buffer.load.p5(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret ptr addrspace(5) %ret -} - -define void @buffer_store_p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; 
GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void @llvm.amdgcn.raw.ptr.buffer.store.p5(ptr addrspace(5) %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} - -define <2 x ptr addrspace(1)> @buffer_load_v2p1(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_v2p1 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<2 x s64>) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3 - ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]] - ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]] - ; GFX9-NEXT: $vgpr2 = COPY [[COPY6]] - ; GFX9-NEXT: $vgpr3 = COPY [[COPY7]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %ret = call <2 x ptr addrspace(1)> @llvm.amdgcn.raw.ptr.buffer.load.v2p1(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret <2 x ptr addrspace(1)> %ret -} - -define 
void @buffer_store_v2p5(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_v2p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; GFX9-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s64>) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void @llvm.amdgcn.raw.ptr.buffer.store.v2p1(<2 x ptr addrspace(1)> %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} - -define <3 x ptr addrspace(5)> @buffer_load_v3p5(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_v3p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: 
[[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX3_OFFSET:%[0-9]+]]:vreg_96_align2 = BUFFER_LOAD_DWORDX3_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<3 x s32>) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub1 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub2 - ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]] - ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]] - ; GFX9-NEXT: $vgpr2 = COPY [[COPY6]] - ; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - %ret = call <3 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v3p5(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret <3 x ptr addrspace(5)> %ret -} - -define void @buffer_store_v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_v3p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1, $vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY 
$sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 - ; GFX9-NEXT: BUFFER_STORE_DWORDX3_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<3 x s32>) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void @llvm.amdgcn.raw.ptr.buffer.store.v3p5(<3 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} - -define <4 x ptr addrspace(5)> @buffer_load_v4p5(ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_load_v4p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (<4 x s32>) from %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3 - ; GFX9-NEXT: $vgpr0 = COPY [[COPY4]] - ; GFX9-NEXT: $vgpr1 = COPY [[COPY5]] - ; GFX9-NEXT: $vgpr2 = COPY [[COPY6]] - ; GFX9-NEXT: $vgpr3 = COPY [[COPY7]] - 
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - %ret = call <4 x ptr addrspace(5)> @llvm.amdgcn.raw.ptr.buffer.load.v4p5(ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret <4 x ptr addrspace(5)> %ret -} - -define void @buffer_store_v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf) { - ; GFX9-LABEL: name: buffer_store_v4p5 - ; GFX9: bb.1 (%ir-block.0): - ; GFX9-NEXT: liveins: $sgpr6, $sgpr7, $sgpr16, $sgpr17, $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX9-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; GFX9-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr16 - ; GFX9-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr17 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; GFX9-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[REG_SEQUENCE]], [[REG_SEQUENCE1]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (<4 x s32>) into %ir.buf, align 1, addrspace 8) - ; GFX9-NEXT: SI_RETURN - call void @llvm.amdgcn.raw.ptr.buffer.store.v4p5(<4 x ptr addrspace(5)> %data, ptr addrspace(8) inreg %buf, i32 0, i32 0, i32 0) - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-old-legalization.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-old-legalization.mir deleted file mode 100644 index a7e3a86024201b..00000000000000 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-old-legalization.mir +++ /dev/null @@ -1,3300 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX7 %s -# RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX7-FLAT %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX12 %s - ---- - -name: load_global_s32_from_4 - -legalized: true -regBankSelected: true -tracksRegLiveness: true - - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_4 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; 
GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_4 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_4 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] - ; - ; GFX8-LABEL: name: load_global_s32_from_4 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = 
FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] - ; - ; GFX9-LABEL: name: load_global_s32_from_4 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX10-LABEL: name: load_global_s32_from_4 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX11-LABEL: name: load_global_s32_from_4 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX12-LABEL: name: load_global_s32_from_4 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 1) - $vgpr0 = COPY %1 - -... 
- ---- - -name: load_global_s32_from_2 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_2 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s16), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_2 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s16), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_2 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] - ; - ; GFX8-LABEL: name: load_global_s32_from_2 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s16), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] - ; - ; GFX9-LABEL: name: load_global_s32_from_2 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]] - ; - ; GFX10-LABEL: name: load_global_s32_from_2 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]] - ; - ; GFX11-LABEL: name: load_global_s32_from_2 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]] - ; - ; GFX12-LABEL: name: load_global_s32_from_2 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY 
[[GLOBAL_LOAD_USHORT]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s32) = G_LOAD %0 :: (load (s16), align 2, addrspace 1) - $vgpr0 = COPY %1 - -... - ---- - -name: load_global_s32_from_1 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: 
load_global_s32_from_1 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = 
GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s32) = G_LOAD %0 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %1 - -... - ---- - -name: load_global_v2s32 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v2s32 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_v2s32 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: 
(load (<2 x s32>), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v2s32 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX8-LABEL: name: load_global_v2s32 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s32>), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX9-LABEL: name: load_global_v2s32 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX10-LABEL: name: load_global_v2s32 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX11-LABEL: name: load_global_v2s32 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX12-LABEL: 
name: load_global_v2s32 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 1) - $vgpr0_vgpr1 = COPY %1 - -... - ---- - -name: load_global_v4s32 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v4s32 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_v4s32 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v4s32 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX8-LABEL: name: load_global_v4s32 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX9-LABEL: name: load_global_v4s32 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX10-LABEL: name: load_global_v4s32 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - 
; GFX11-LABEL: name: load_global_v4s32 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX12-LABEL: name: load_global_v4s32 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s32>), align 4, addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 1) - $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 - -... - ---- - -name: load_global_s64 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s64 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s64 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: 
{{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s64 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX8-LABEL: name: load_global_s64 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX9-LABEL: name: load_global_s64 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX10-LABEL: name: load_global_s64 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX11-LABEL: name: load_global_s64 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX12-LABEL: name: load_global_s64 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 1) - $vgpr0_vgpr1 = COPY %1 - -... 
- ---- - -name: load_global_v2s64 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v2s64 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_v2s64 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v2s64 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ 
$}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX8-LABEL: name: load_global_v2s64 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX9-LABEL: name: load_global_v2s64 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX10-LABEL: name: load_global_v2s64 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX11-LABEL: name: load_global_v2s64 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX12-LABEL: name: load_global_v2s64 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 1) - $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 - -... - ---- - -name: load_global_v2p1 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v2p1 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_v2p1 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], 
%subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v2p1 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX8-LABEL: name: load_global_v2p1 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX9-LABEL: name: load_global_v2p1 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX10-LABEL: name: load_global_v2p1 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX11-LABEL: name: load_global_v2p1 - ; GFX11: liveins: $vgpr0_vgpr1 
- ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; - ; GFX12-LABEL: name: load_global_v2p1 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s64>), align 4, addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 1) - %2:vgpr(<2 x p1>) = G_BITCAST %1(<2 x s64>) - $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2 - -... - ---- - -name: load_global_s128 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s128 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX7-LABEL: name: load_global_s128 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX7-FLAT-LABEL: name: load_global_s128 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; 
GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX8-LABEL: name: load_global_s128 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX9-LABEL: name: load_global_s128 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX10-LABEL: name: load_global_s128 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX11-LABEL: name: load_global_s128 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - ; - ; GFX12-LABEL: name: load_global_s128 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p1) :: (load (s128), align 4, addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s128) = G_LOAD %0 :: (load (s128), align 4, addrspace 1) - $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 - -... 
- ---- - -name: load_global_p3_from_4 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_p3_from_4 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p3), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_p3_from_4 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p3), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_p3_from_4 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 
- ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] - ; - ; GFX8-LABEL: name: load_global_p3_from_4 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p3), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] - ; - ; GFX9-LABEL: name: load_global_p3_from_4 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX10-LABEL: name: load_global_p3_from_4 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX11-LABEL: name: load_global_p3_from_4 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX12-LABEL: name: load_global_p3_from_4 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - 
%1:vgpr(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 1) - $vgpr0 = COPY %1 - -... - ---- - -name: load_global_p1_from_8 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_p1_from_8 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p1), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_p1_from_8 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (p1), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_p1_from_8 - ; GFX7-FLAT: 
liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX8-LABEL: name: load_global_p1_from_8 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (p1), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX9-LABEL: name: load_global_p1_from_8 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX10-LABEL: name: load_global_p1_from_8 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX11-LABEL: name: load_global_p1_from_8 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX12-LABEL: name: load_global_p1_from_8 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: 
[[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(p1) = G_LOAD %0 :: (load (p1), align 8, addrspace 1) - $vgpr0_vgpr1 = COPY %1 - -... - ---- - -name: load_global_p999_from_8 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_p999_from_8 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX7-LABEL: name: load_global_p999_from_8 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX7-FLAT-LABEL: name: load_global_p999_from_8 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX8-LABEL: name: load_global_p999_from_8 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX9-LABEL: name: load_global_p999_from_8 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 
1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX10-LABEL: name: load_global_p999_from_8 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX11-LABEL: name: load_global_p999_from_8 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; - ; GFX12-LABEL: name: load_global_p999_from_8 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load (p999), addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(p999) = G_LOAD %0 :: (load (p999), align 8, addrspace 1) - $vgpr0_vgpr1 = COPY %1 - -... 
- ---- - -name: load_global_v2p3 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v2p3 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX7-LABEL: name: load_global_v2p3 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX7-FLAT-LABEL: name: load_global_v2p3 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX8-LABEL: name: load_global_v2p3 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX9-LABEL: name: load_global_v2p3 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX10-LABEL: name: load_global_v2p3 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x 
p3>), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX11-LABEL: name: load_global_v2p3 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - ; - ; GFX12-LABEL: name: load_global_v2p3 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 1) - $vgpr0_vgpr1 = COPY %1 - -... - ---- - -name: load_global_v2s16 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v2s16 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_v2s16 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; 
GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v2s16 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] - ; - ; GFX8-LABEL: name: load_global_v2s16 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<2 x s16>), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] - ; - ; GFX9-LABEL: name: load_global_v2s16 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX10-LABEL: name: load_global_v2s16 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = 
GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX11-LABEL: name: load_global_v2s16 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - ; - ; GFX12-LABEL: name: load_global_v2s16 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 1) - $vgpr0 = COPY %1 - -... - ---- - -name: load_global_v4s16 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v4s16 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-LABEL: name: 
load_global_v4s16 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v4s16 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX8-LABEL: name: load_global_v4s16 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<4 x s16>), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] - ; - ; GFX9-LABEL: name: load_global_v4s16 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX10-LABEL: name: 
load_global_v4s16 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX11-LABEL: name: load_global_v4s16 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; - ; GFX12-LABEL: name: load_global_v4s16 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 1) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 8, addrspace 1) - $vgpr0_vgpr1 = COPY %1 - -... 
- ---- - -name: load_global_v8s16 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_v8s16 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_v8s16 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_v8s16 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ 
$}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] - ; - ; GFX8-LABEL: name: load_global_v8s16 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>) - ; - ; GFX9-LABEL: name: load_global_v8s16 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>) - ; - ; GFX10-LABEL: name: load_global_v8s16 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>) - ; - ; GFX11-LABEL: name: load_global_v8s16 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>) - ; - ; GFX12-LABEL: name: load_global_v8s16 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load (<8 x s16>), align 4, addrspace 1) - ; GFX12-NEXT: 
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>) - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 1) - $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 - -... - -################################################################################ -### Stress addressing modes -################################################################################ - ---- - -name: load_global_s32_from_1_gep_2047 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, 
[[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2047, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_2047 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - 
; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 2047 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_2048 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 
- ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY 
[[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 2048, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_2048 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 2048 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
- ---- - -name: load_global_s32_from_1_gep_m2047 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY 
[[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2047, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; 
GFX9-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_m2047 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -2047 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
- ---- - -name: load_global_s32_from_1_gep_m2048 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY 
[[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -2048, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; 
GFX9-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_m2048 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -2048 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
- ---- - -name: load_global_s32_from_1_gep_4095 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; 
GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; 
GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4095, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX11: liveins: $vgpr0_vgpr1 - ; 
GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_4095 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 4095 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_4096 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX7: liveins: $vgpr0_vgpr1 - ; 
GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, 
implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: 
[[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 4096, implicit $exec - ; GFX11-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_4096 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4096, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 4096 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
- ---- - -name: load_global_s32_from_1_gep_m4095 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY 
[[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; 
GFX9-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4095, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_m4095 - ; GFX12: 
liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -4095 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_m4096 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] 
- ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead 
[[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -4096, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: 
$vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_m4096 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -4096 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_8191 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: 
$vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, 
[[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8191, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_8191 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 8191, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 8191 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
- ---- - -name: load_global_s32_from_1_gep_8192 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY 
[[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: 
[[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = 
COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8192, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 
0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_8192 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 8192, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 8192 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_m8191 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE 
[[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8191, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = 
REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_m8191 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -8191, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -8191 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_m8192 -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: 
[[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY 
[[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY 
[[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8192, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead 
[[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_m8192 - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -8192, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -8192 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
- ---- - -name: load_global_s32_from_1_gep_24bit_max -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8388607 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8388607 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY 
[[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: 
[[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec - ; GFX10-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 8388607, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = 
GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_24bit_max - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 8388607, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 8388607 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_2x_24bit_max -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16777214 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16777214 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY 
[[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead 
[[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY 
[[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_2x_24bit_max - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 16777214, implicit $exec - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE 
[[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 16777214 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_24bit_min -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; 
GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit 
$exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; 
GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_24bit_min - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -8388608, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: 
load_global_s32_from_1_gep_24bit_min - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -8388608, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -8388608 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... - ---- - -name: load_global_s32_from_1_gep_2x_24bit_min -legalized: true -regBankSelected: true -tracksRegLiveness: true - -body: | - bb.0: - liveins: $vgpr0_vgpr1 - - ; GFX6-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX6-NEXT: 
[[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX7: liveins: $vgpr0_vgpr1 - ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 - ; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; GFX7-NEXT: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], 0, 0, 
0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] - ; - ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX7-FLAT: liveins: $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX7-FLAT-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7-FLAT-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX7-FLAT-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7-FLAT-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX7-FLAT-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX8-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX8: liveins: $vgpr0_vgpr1 - ; GFX8-NEXT: {{ $}} - ; GFX8-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX8-NEXT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 1) - ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] - ; - ; GFX9-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX9: liveins: $vgpr0_vgpr1 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX9-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX9-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX10-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX10: liveins: $vgpr0_vgpr1 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 - ; GFX10-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX10-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX11-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX11: liveins: $vgpr0_vgpr1 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX11-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX11-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - ; - ; GFX12-LABEL: name: load_global_s32_from_1_gep_2x_24bit_min - ; GFX12: liveins: $vgpr0_vgpr1 - ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO -16777215, implicit $exec - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub0 - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B]].sub1 - ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX12-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 - ; GFX12-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) - ; GFX12-NEXT: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] - %0:vgpr(p1) = COPY $vgpr0_vgpr1 - %1:vgpr(s64) = G_CONSTANT i64 -16777215 - %2:vgpr(p1) = G_PTR_ADD %0, %1 - %3:vgpr(s32) = G_LOAD %2 :: (load (s8), align 1, addrspace 1) - $vgpr0 = COPY %3 - -... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir index 59c57a5fefbed9..280c7a5a492da8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir @@ -1,7 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX7 %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+cumode -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s @@ -24,7 +24,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s32), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; 
GFX7-LABEL: name: load_local_s32_from_4 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -32,14 +31,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s32), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX9-LABEL: name: load_local_s32_from_4 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (s32), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_4 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -70,7 +67,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U16_]] - ; ; GFX7-LABEL: name: load_local_s32_from_2 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -78,14 +74,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s16), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U16_]] - ; ; GFX9-LABEL: name: load_local_s32_from_2 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (s16), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_U16_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -119,7 +113,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX7-LABEL: name: load_local_s32_from_1 ; 
GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -127,14 +120,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX9-LABEL: name: load_local_s32_from_1 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (s8), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -165,7 +156,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<2 x s32>), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX7-LABEL: name: load_local_v2s32 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -173,14 +163,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<2 x s32>), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX9-LABEL: name: load_local_v2s32 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] - ; ; GFX10-LABEL: name: load_local_v2s32 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -211,7 +199,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), align 4, addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) - ; ; GFX7-LABEL: name: load_local_v2s32_align4 ; GFX7: liveins: $vgpr0 ; 
GFX7-NEXT: {{ $}} @@ -219,14 +206,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load (<2 x s32>), align 4, addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] - ; ; GFX9-LABEL: name: load_local_v2s32_align4 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (<2 x s32>), align 4, addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_v2s32_align4 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -257,7 +242,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s64), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX7-LABEL: name: load_local_s64 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -265,14 +249,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (s64), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX9-LABEL: name: load_local_s64 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (s64), addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s64 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -303,7 +285,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load (s64), align 4, addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) - ; ; GFX7-LABEL: name: load_local_s64_align4 ; GFX7: liveins: $vgpr0 ; 
GFX7-NEXT: {{ $}} @@ -311,14 +292,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load (s64), align 4, addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] - ; ; GFX9-LABEL: name: load_local_s64_align4 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (s64), align 4, addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s64_align4 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -349,7 +328,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (p3), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX7-LABEL: name: load_local_p3_from_4 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -357,14 +335,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (p3), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX9-LABEL: name: load_local_p3_from_4 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (p3), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_p3_from_4 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -395,7 +371,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (p5), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX7-LABEL: name: load_local_p5_from_4 ; GFX7: liveins: $vgpr0 ; 
GFX7-NEXT: {{ $}} @@ -403,14 +378,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (p5), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX9-LABEL: name: load_local_p5_from_4 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (p5), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_p5_from_4 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -441,7 +414,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (p1), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX7-LABEL: name: load_local_p1_align8 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -449,14 +421,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (p1), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX9-LABEL: name: load_local_p1_align8 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (p1), addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] - ; ; GFX10-LABEL: name: load_local_p1_align8 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -487,7 +457,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load (p1), align 4, addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) - ; ; GFX7-LABEL: name: load_local_p1_align4 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -495,14 +464,12 @@ body: | ; 
GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load (p1), align 4, addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] - ; ; GFX9-LABEL: name: load_local_p1_align4 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load (p1), align 4, addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_p1_align4 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -533,7 +500,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load (p999), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; ; GFX7-LABEL: name: load_local_p999_from_8 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -541,14 +507,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load (p999), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; ; GFX9-LABEL: name: load_local_p999_from_8 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load (p999), addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p999) - ; ; GFX10-LABEL: name: load_local_p999_from_8 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -575,36 +539,32 @@ body: | ; GFX6-LABEL: name: load_local_v2p3 ; GFX6: liveins: $vgpr0 ; GFX6-NEXT: {{ $}} - ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 ; GFX6-NEXT: $m0 = S_MOV_B32 -1 - ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<2 x s32>), addrspace 3) - ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; + ; 
GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; GFX7-LABEL: name: load_local_v2p3 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 ; GFX7-NEXT: $m0 = S_MOV_B32 -1 - ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<2 x s32>), addrspace 3) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; + ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; GFX9-LABEL: name: load_local_v2p3 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 3) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] - ; + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; GFX10-LABEL: name: load_local_v2p3 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s32>), addrspace 3) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:vgpr(p3) = COPY $vgpr0 - %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 3) - %2:vgpr(<2 x p3>) = G_BITCAST %1(<2 x s32>) - $vgpr0_vgpr1 
= COPY %2 + %1:vgpr(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 3) + $vgpr0_vgpr1 = COPY %1 ... @@ -626,7 +586,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<2 x s16>), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX7-LABEL: name: load_local_v2s16 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -634,14 +593,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<2 x s16>), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_B32_]] - ; ; GFX9-LABEL: name: load_local_v2s16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_v2s16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -672,7 +629,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<4 x s16>), addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX7-LABEL: name: load_local_v4s16 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -680,14 +636,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load (<4 x s16>), addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] - ; ; GFX9-LABEL: name: load_local_v4s16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load (<4 x s16>), addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY 
[[DS_READ_B64_gfx9_]] - ; ; GFX10-LABEL: name: load_local_v4s16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -744,7 +698,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX7-LABEL: name: load_local_s32_from_1_gep_65535 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -752,14 +705,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 65535, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX9-LABEL: name: load_local_s32_from_1_gep_65535 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY]], 65535, 0, implicit $exec :: (load (s8), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_1_gep_65535 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -794,7 +745,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_AND_B32_e64_]], 65535, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX7-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -804,7 +754,6 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_AND_B32_e64_]], 65535, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX9-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -813,7 +762,6 @@ body: | ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], 
implicit $exec ; GFX9-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_AND_B32_e64_]], 65535, 0, implicit $exec :: (load (s8), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -853,7 +801,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX7-LABEL: name: load_local_s32_from_1_gep_65536 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -863,7 +810,6 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX9-LABEL: name: load_local_s32_from_1_gep_65536 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -872,7 +818,6 @@ body: | ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (load (s8), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_1_gep_65536 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -909,7 +854,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 3) ; GFX6-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX7-LABEL: name: load_local_s32_from_1_gep_m1 ; GFX7: liveins: $vgpr0 ; GFX7-NEXT: {{ $}} @@ -919,7 +863,6 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_ADD_CO_U32_e64_]], 0, 0, implicit $m0, implicit $exec :: (load (s8), addrspace 
3) ; GFX7-NEXT: $vgpr0 = COPY [[DS_READ_U8_]] - ; ; GFX9-LABEL: name: load_local_s32_from_1_gep_m1 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -928,7 +871,6 @@ body: | ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9-NEXT: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (load (s8), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s32_from_1_gep_m1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -965,7 +907,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load (s64), align 4, addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) - ; ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1016 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -973,14 +914,12 @@ body: | ; GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 254, 255, 0, implicit $m0, implicit $exec :: (load (s64), align 4, addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] - ; ; GFX9-LABEL: name: load_local_s64_align4_from_1_gep_1016 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 254, 255, 0, implicit $exec :: (load (s64), align 4, addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s64_align4_from_1_gep_1016 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -1015,7 +954,6 @@ body: | ; GFX6-NEXT: $m0 = S_MOV_B32 -1 ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load (s64), align 4, addrspace 3) ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) - ; ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1020 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} @@ -1025,7 +963,6 @@ body: | ; 
GFX7-NEXT: $m0 = S_MOV_B32 -1 ; GFX7-NEXT: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[V_ADD_CO_U32_e64_]], 0, 1, 0, implicit $m0, implicit $exec :: (load (s64), align 4, addrspace 3) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] - ; ; GFX9-LABEL: name: load_local_s64_align4_from_1_gep_1020 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} @@ -1034,7 +971,6 @@ body: | ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9-NEXT: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load (s64), align 4, addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] - ; ; GFX10-LABEL: name: load_local_s64_align4_from_1_gep_1020 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir index ff1d3fe3796732..a63df136e003c3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-constant.mir @@ -6604,25 +6604,22 @@ body: | ; CI: liveins: $vgpr0_vgpr1 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), addrspace 4) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; VI-LABEL: name: test_load_constant_v2p3_align8 ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), addrspace 4) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; VI-NEXT: 
$vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-LABEL: name: test_load_constant_v2p3_align8 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), addrspace 4) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), addrspace 4) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 4) $vgpr0_vgpr1 = COPY %1 @@ -6638,25 +6635,22 @@ body: | ; CI: liveins: $vgpr0_vgpr1 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), align 4, addrspace 4) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), align 4, addrspace 4) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; VI-LABEL: name: test_load_constant_v2p3_align4 ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), align 4, addrspace 4) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), align 4, addrspace 4) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-LABEL: name: 
test_load_constant_v2p3_align4 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p4) :: (load (<2 x s32>), align 4, addrspace 4) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p4) :: (load (<2 x p3>), align 4, addrspace 4) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 4, addrspace 4) $vgpr0_vgpr1 = COPY %1 @@ -6689,6 +6683,7 @@ body: | ; CI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; CI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; CI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64) ; CI-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p4) :: (load (s8) from unknown-address + 4, addrspace 4) @@ -6704,9 +6699,9 @@ body: | ; CI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; CI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; CI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; VI-LABEL: name: test_load_constant_v2p3_align1 ; VI: liveins: $vgpr0_vgpr1 @@ -6729,6 +6724,7 @@ body: | ; VI-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; VI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; VI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64) ; VI-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p4) :: (load (s8) from unknown-address + 4, addrspace 4) @@ -6744,9 +6740,9 @@ body: | ; VI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX9-LABEL: name: test_load_constant_v2p3_align1 ; GFX9: liveins: $vgpr0_vgpr1 @@ -6769,6 +6765,7 @@ body: | ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p4) :: (load (s8) from unknown-address + 4, addrspace 4) @@ -6784,9 +6781,9 @@ body: | ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] 
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 1, addrspace 4) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir index 3b166660a84b75..b1d7d36f9912e7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ -15192,73 +15192,65 @@ body: | ; CI: liveins: $vgpr0_vgpr1 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 8) + ; CI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p0) :: (load (p3), align 8) ; CI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4) - ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p0) :: (load (p3) from unknown-address + 4) + ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; VI-LABEL: name: test_load_flat_v2p3_align8 ; VI: liveins: 
$vgpr0_vgpr1 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 8) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p0) :: (load (p3), align 8) ; VI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4) - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p0) :: (load (p3) from unknown-address + 4) + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX9PLUS-LABEL: name: test_load_flat_v2p3_align8 ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>)) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align8 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = 
G_LOAD [[COPY]](p0) :: (load (<2 x p3>)) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_flat_v2p3_align8 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align8 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 ; UNALIGNED_GFX9PLUS-NEXT: {{ $}} ; UNALIGNED_GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; UNALIGNED_GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; UNALIGNED_GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>)) + ; UNALIGNED_GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX11PLUS-LABEL: name: test_load_flat_v2p3_align8 ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1 ; UNALIGNED_GFX11PLUS-NEXT: {{ $}} ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>)) + ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2p3_align8 ; 
UNALIGNED_GFX12: liveins: $vgpr0_vgpr1 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>)) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -15274,73 +15266,65 @@ body: | ; CI: liveins: $vgpr0_vgpr1 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) + ; CI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p0) :: (load (p3)) ; CI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4) - ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p0) :: (load (p3) from unknown-address + 4) + ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; VI-LABEL: name: test_load_flat_v2p3_align4 ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32)) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p0) :: (load (p3)) ; VI-NEXT: 
[[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 4) - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p0) :: (load (p3) from unknown-address + 4) + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX9PLUS-LABEL: name: test_load_flat_v2p3_align4 ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align4 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_flat_v2p3_align4 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: 
[[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align4 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 ; UNALIGNED_GFX9PLUS-NEXT: {{ $}} ; UNALIGNED_GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; UNALIGNED_GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4) - ; UNALIGNED_GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4) + ; UNALIGNED_GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX11PLUS-LABEL: name: test_load_flat_v2p3_align4 ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1 ; UNALIGNED_GFX11PLUS-NEXT: {{ $}} ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4) - ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4) + ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2p3_align4 ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; UNALIGNED_GFX12-NEXT: 
[[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 4, addrspace 0) $vgpr0_vgpr1 = COPY %1 @@ -15373,6 +15357,7 @@ body: | ; CI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; CI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; CI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; CI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; CI-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15388,9 +15373,9 @@ body: | ; CI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; CI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; CI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; VI-LABEL: name: test_load_flat_v2p3_align1 ; VI: liveins: $vgpr0_vgpr1 @@ -15413,6 +15398,7 @@ body: | ; VI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR 
[[SHL2]], [[OR]] + ; VI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; VI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; VI-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15428,9 +15414,9 @@ body: | ; VI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1 ; GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -15453,6 +15439,7 @@ body: | ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15468,9 +15455,9 @@ body: | ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; GFX9PLUS-NEXT: 
[[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 @@ -15493,6 +15480,7 @@ body: | ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15508,9 +15496,9 @@ body: | ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX11PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_flat_v2p3_align1 ; GFX12: liveins: $vgpr0_vgpr1 @@ -15533,6 +15521,7 @@ body: | ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], 
[[C3]](s32) ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15548,9 +15537,9 @@ body: | ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX12-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -15573,6 +15562,7 @@ body: | ; UNALIGNED_GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; UNALIGNED_GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; UNALIGNED_GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; UNALIGNED_GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; UNALIGNED_GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; UNALIGNED_GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15588,9 +15578,9 @@ body: | ; UNALIGNED_GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; UNALIGNED_GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL 
[[OR4]], [[C3]](s32) ; UNALIGNED_GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; UNALIGNED_GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; UNALIGNED_GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; UNALIGNED_GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX9PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; UNALIGNED_GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; UNALIGNED_GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; UNALIGNED_GFX11PLUS-LABEL: name: test_load_flat_v2p3_align1 ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1 @@ -15613,6 +15603,7 @@ body: | ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15628,9 +15619,9 @@ body: | ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX11PLUS-NEXT: 
[[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2p3_align1 ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1 @@ -15653,6 +15644,7 @@ body: | ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) @@ -15668,9 +15660,9 @@ body: | ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 1, addrspace 0) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir index f384114ee4cde7..d6acc6ecdfc660 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir @@ -13448,49 +13448,43 @@ body: | ; SI: liveins: $vgpr0_vgpr1 ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), addrspace 1) - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; SI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-HSA-LABEL: name: test_load_global_v2p3_align8 ; CI-HSA: liveins: $vgpr0_vgpr1 ; CI-HSA-NEXT: {{ $}} ; CI-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), addrspace 1) - ; CI-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-HSA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; CI-HSA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-MESA-LABEL: name: test_load_global_v2p3_align8 ; CI-MESA: liveins: $vgpr0_vgpr1 ; CI-MESA-NEXT: {{ $}} ; CI-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), addrspace 1) - ; CI-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; CI-MESA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; VI-LABEL: name: test_load_global_v2p3_align8 ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} ; 
VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), addrspace 1) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-HSA-LABEL: name: test_load_global_v2p3_align8 ; GFX9-HSA: liveins: $vgpr0_vgpr1 ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), addrspace 1) - ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-HSA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; GFX9-HSA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-MESA-LABEL: name: test_load_global_v2p3_align8 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), addrspace 1) - ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), addrspace 1) + ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -13506,49 +13500,43 @@ body: | ; SI: liveins: $vgpr0_vgpr1 ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 4, addrspace 1) - ; SI-NEXT: 
[[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 4, addrspace 1) + ; SI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-HSA-LABEL: name: test_load_global_v2p3_align4 ; CI-HSA: liveins: $vgpr0_vgpr1 ; CI-HSA-NEXT: {{ $}} ; CI-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 4, addrspace 1) - ; CI-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-HSA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 4, addrspace 1) + ; CI-HSA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-MESA-LABEL: name: test_load_global_v2p3_align4 ; CI-MESA: liveins: $vgpr0_vgpr1 ; CI-MESA-NEXT: {{ $}} ; CI-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 4, addrspace 1) - ; CI-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 4, addrspace 1) + ; CI-MESA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; VI-LABEL: name: test_load_global_v2p3_align4 ; VI: liveins: $vgpr0_vgpr1 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 4, addrspace 1) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 4, addrspace 1) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 
x p3>) ; ; GFX9-HSA-LABEL: name: test_load_global_v2p3_align4 ; GFX9-HSA: liveins: $vgpr0_vgpr1 ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 4, addrspace 1) - ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-HSA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 4, addrspace 1) + ; GFX9-HSA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-MESA-LABEL: name: test_load_global_v2p3_align4 ; GFX9-MESA: liveins: $vgpr0_vgpr1 ; GFX9-MESA-NEXT: {{ $}} ; GFX9-MESA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 4, addrspace 1) - ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-MESA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 4, addrspace 1) + ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 4, addrspace 1) $vgpr0_vgpr1 = COPY %1 @@ -13581,6 +13569,7 @@ body: | ; SI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; SI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; SI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; SI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; SI-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p1) :: (load (s8) from unknown-address + 4, addrspace 1) @@ -13596,17 +13585,16 @@ body: | ; SI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; SI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = 
G_SHL [[OR4]], [[C3]](s32) ; SI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; SI-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; CI-HSA-LABEL: name: test_load_global_v2p3_align1 ; CI-HSA: liveins: $vgpr0_vgpr1 ; CI-HSA-NEXT: {{ $}} ; CI-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CI-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 1, addrspace 1) - ; CI-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-HSA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 1, addrspace 1) + ; CI-HSA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-MESA-LABEL: name: test_load_global_v2p3_align1 ; CI-MESA: liveins: $vgpr0_vgpr1 @@ -13629,6 +13617,7 @@ body: | ; CI-MESA-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CI-MESA-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; CI-MESA-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; CI-MESA-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; CI-MESA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; CI-MESA-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p1) :: (load (s8) from unknown-address + 4, addrspace 1) @@ -13644,9 +13633,9 @@ body: | ; CI-MESA-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; CI-MESA-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; CI-MESA-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR 
[[SHL5]], [[OR3]] - ; CI-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; CI-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CI-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-MESA-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; CI-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; CI-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; VI-LABEL: name: test_load_global_v2p3_align1 ; VI: liveins: $vgpr0_vgpr1 @@ -13669,6 +13658,7 @@ body: | ; VI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; VI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; VI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; VI-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p1) :: (load (s8) from unknown-address + 4, addrspace 1) @@ -13684,17 +13674,16 @@ body: | ; VI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; VI-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; VI-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX9-HSA-LABEL: name: test_load_global_v2p3_align1 ; GFX9-HSA: liveins: $vgpr0_vgpr1 ; GFX9-HSA-NEXT: {{ $}} ; GFX9-HSA-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX9-HSA-NEXT: 
[[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load (<2 x s32>), align 1, addrspace 1) - ; GFX9-HSA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-HSA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-HSA-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load (<2 x p3>), align 1, addrspace 1) + ; GFX9-HSA-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-MESA-LABEL: name: test_load_global_v2p3_align1 ; GFX9-MESA: liveins: $vgpr0_vgpr1 @@ -13717,6 +13706,7 @@ body: | ; GFX9-MESA-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-MESA-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) ; GFX9-MESA-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; GFX9-MESA-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) ; GFX9-MESA-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-MESA-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) ; GFX9-MESA-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p1) :: (load (s8) from unknown-address + 4, addrspace 1) @@ -13732,9 +13722,9 @@ body: | ; GFX9-MESA-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] ; GFX9-MESA-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) ; GFX9-MESA-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) - ; GFX9-MESA-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-MESA-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) + ; GFX9-MESA-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) + ; GFX9-MESA-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 1, addrspace 1) $vgpr0_vgpr1 = COPY %1 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir index 1608234d6b2bc5..1249de647bb759 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir @@ -14595,81 +14595,71 @@ body: | ; SI: liveins: $vgpr0 ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; SI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-LABEL: name: test_load_local_v2p3_align8 ; CI: liveins: $vgpr0 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; CI-DS128-LABEL: name: test_load_local_v2p3_align8 ; CI-DS128: liveins: $vgpr0 ; CI-DS128-NEXT: {{ $}} ; CI-DS128-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; CI-DS128-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; CI-DS128-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; CI-DS128-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-DS128-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; CI-DS128-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; VI-LABEL: name: test_load_local_v2p3_align8 ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: 
[[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-LABEL: name: test_load_local_v2p3_align8 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX9-UNALIGNED-LABEL: name: test_load_local_v2p3_align8 ; GFX9-UNALIGNED: liveins: $vgpr0 ; GFX9-UNALIGNED-NEXT: {{ $}} ; GFX9-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX9-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; GFX9-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX9-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX9-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX10-LABEL: name: test_load_local_v2p3_align8 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX10-NEXT: 
[[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX10-UNALIGNED-LABEL: name: test_load_local_v2p3_align8 ; GFX10-UNALIGNED: liveins: $vgpr0 ; GFX10-UNALIGNED-NEXT: {{ $}} ; GFX10-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX10-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; GFX10-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX10-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX10-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX10-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11-LABEL: name: test_load_local_v2p3_align8 ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11-UNALIGNED-LABEL: name: test_load_local_v2p3_align8 ; GFX11-UNALIGNED: liveins: $vgpr0 ; GFX11-UNALIGNED-NEXT: {{ $}} ; GFX11-UNALIGNED-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 - ; GFX11-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load (<2 x s32>), addrspace 3) - ; GFX11-UNALIGNED-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX11-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX11-UNALIGNED-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load (<2 x p3>), addrspace 3) + ; GFX11-UNALIGNED-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p3) = COPY $vgpr0 %1:_(<2 x p3>) = 
G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir index 472cbe559e56f4..741f878c86f8b6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir @@ -16526,117 +16526,106 @@ body: | ; SI: liveins: $vgpr0 ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; SI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; SI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; SI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; SI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, addrspace 5) - ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; SI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from unknown-address + 4, addrspace 5) + ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; CI-LABEL: name: test_load_private_v2p3_align8 ; CI: liveins: $vgpr0 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; CI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; CI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from 
unknown-address + 4, addrspace 5) - ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; CI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from unknown-address + 4, addrspace 5) + ; CI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; CI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; VI-LABEL: name: test_load_private_v2p3_align8 ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, addrspace 5) - ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; VI-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from unknown-address + 4, addrspace 5) + ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX9-LABEL: name: test_load_private_v2p3_align8 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from unknown-address + 4, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX10-LABEL: name: test_load_private_v2p3_align8 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from unknown-address + 4, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; GFX11-LABEL: name: test_load_private_v2p3_align8 ; GFX11: liveins: $vgpr0 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 
- ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), addrspace 5) - ; GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p5) :: (load (<2 x p3>), addrspace 5) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_private_v2p3_align8 ; GFX12: liveins: $vgpr0 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), addrspace 5) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p5) :: (load (<2 x p3>), addrspace 5) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2p3_align8 ; UNALIGNED_GFX9: liveins: $vgpr0 ; UNALIGNED_GFX9-NEXT: {{ $}} ; UNALIGNED_GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; UNALIGNED_GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; UNALIGNED_GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; UNALIGNED_GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, addrspace 5) - ; UNALIGNED_GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; UNALIGNED_GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; UNALIGNED_GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from 
unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; UNALIGNED_GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; UNALIGNED_GFX10-LABEL: name: test_load_private_v2p3_align8 ; UNALIGNED_GFX10: liveins: $vgpr0 ; UNALIGNED_GFX10-NEXT: {{ $}} ; UNALIGNED_GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 8, addrspace 5) + ; UNALIGNED_GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 8, addrspace 5) ; UNALIGNED_GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; UNALIGNED_GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, addrspace 5) - ; UNALIGNED_GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) - ; UNALIGNED_GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>) - ; UNALIGNED_GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(p3) = G_LOAD [[PTR_ADD]](p5) :: (load (p3) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3) + ; UNALIGNED_GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) ; ; UNALIGNED_GFX11-LABEL: name: test_load_private_v2p3_align8 ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) 
= G_LOAD [[COPY]](p5) :: (load (<2 x p3>), addrspace 5) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2p3_align8 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p5) :: (load (<2 x p3>), addrspace 5) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 5) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir index b9c72d39ed45b6..f2a88a21a286ef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir @@ -2310,9 +2310,9 @@ body: | ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; SI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>) + ; SI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) + ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) ; SI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 @@ -2332,7 
+2332,8 @@ body: | ; SI-NEXT: G_STORE [[LSHR2]](s32), [[PTR_ADD2]](p1) :: (store (s8) into unknown-address + 3, addrspace 1) ; SI-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; SI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) + ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[PTRTOINT1]](s32) ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY4]], [[C]](s32) ; SI-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; SI-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32) @@ -2352,17 +2353,16 @@ body: | ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; CI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 1, addrspace 1) + ; CI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 1, addrspace 1) ; ; VI-LABEL: name: test_store_global_v2p3_align1 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; VI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>) + ; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) + ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) ; VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 @@ -2383,7 +2383,8 @@ body: | ; VI-NEXT: G_STORE 
[[ANYEXT1]](s32), [[PTR_ADD2]](p1) :: (store (s8) into unknown-address + 3, addrspace 1) ; VI-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; VI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) + ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[PTRTOINT1]](s32) ; VI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32) ; VI-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s64) ; VI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32) @@ -2404,8 +2405,7 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; GFX9-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 1, addrspace 1) + ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 1, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x p3>), align 1, addrspace 1) @@ -2422,9 +2422,9 @@ body: | ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; SI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; SI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>) + ; SI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) + ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) ; SI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 @@ -2433,7 +2433,8 @@ 
body: | ; SI-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 2, addrspace 1) ; SI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; SI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) + ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[PTRTOINT1]](s32) ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32) ; SI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD1]], [[C1]](s64) ; SI-NEXT: G_STORE [[COPY3]](s32), [[PTR_ADD1]](p1) :: (store (s16) into unknown-address + 4, addrspace 1) @@ -2444,17 +2445,16 @@ body: | ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; CI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 2, addrspace 1) + ; CI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 2, addrspace 1) ; ; VI-LABEL: name: test_store_global_v2p3_align2 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; VI-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<2 x s32>) - ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; VI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>) + ; VI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) + ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C]](s32) ; VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 @@ -2463,7 +2463,8 @@ 
body: | ; VI-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 2, addrspace 1) ; VI-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; VI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) + ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[PTRTOINT1]](s32) ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32) ; VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[PTR_ADD1]], [[C1]](s64) ; VI-NEXT: G_STORE [[COPY3]](s32), [[PTR_ADD1]](p1) :: (store (s16) into unknown-address + 4, addrspace 1) @@ -2474,8 +2475,7 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; GFX9-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 2, addrspace 1) + ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 2, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x p3>), align 2, addrspace 1) @@ -2492,32 +2492,28 @@ body: | ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; SI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 4, addrspace 1) + ; SI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 4, addrspace 1) ; ; CI-LABEL: name: test_store_global_v2p3_align4 ; CI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x 
p3>) - ; CI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 4, addrspace 1) + ; CI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 4, addrspace 1) ; ; VI-LABEL: name: test_store_global_v2p3_align4 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; VI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 4, addrspace 1) + ; VI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 4, addrspace 1) ; ; GFX9-LABEL: name: test_store_global_v2p3_align4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; GFX9-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 4, addrspace 1) + ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 4, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x p3>), align 4, addrspace 1) @@ -2534,32 +2530,28 @@ body: | ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; SI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), addrspace 1) + ; SI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) ; ; CI-LABEL: name: test_store_global_v2p3_align8 ; CI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY 
$vgpr2_vgpr3 - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; CI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), addrspace 1) + ; CI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) ; ; VI-LABEL: name: test_store_global_v2p3_align8 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; VI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), addrspace 1) + ; VI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) ; ; GFX9-LABEL: name: test_store_global_v2p3_align8 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; GFX9-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), addrspace 1) + ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x p3>), align 8, addrspace 1) @@ -2576,32 +2568,28 @@ body: | ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; SI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 16, addrspace 1) + ; SI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 16, addrspace 1) ; ; CI-LABEL: name: test_store_global_v2p3_align16 ; CI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 
; CI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; CI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; CI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 16, addrspace 1) + ; CI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 16, addrspace 1) ; ; VI-LABEL: name: test_store_global_v2p3_align16 ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; VI-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 16, addrspace 1) + ; VI-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 16, addrspace 1) ; ; GFX9-LABEL: name: test_store_global_v2p3_align16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY1]](<2 x p3>) - ; GFX9-NEXT: G_STORE [[BITCAST]](<2 x s32>), [[COPY]](p1) :: (store (<2 x s32>), align 16, addrspace 1) + ; GFX9-NEXT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store (<2 x p3>), align 16, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(<2 x p3>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store (<2 x p3>), align 16, addrspace 1) From 8306114ed2313a7febdb0d0d0c31df357ed53fdd Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 10 Oct 2024 12:49:35 +0100 Subject: [PATCH 002/177] [clang][x86] Add constexpr support for _mm_cvtsi32_ss/_mm_cvt_si2ss/_mm_cvtsi64_ss SSE1 intrinsics Followup to #111001 --- clang/lib/Headers/xmmintrin.h | 15 ++++++--------- clang/test/CodeGen/X86/sse-builtins.c | 9 +++++++++ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/clang/lib/Headers/xmmintrin.h 
b/clang/lib/Headers/xmmintrin.h index 2aa688adefc25a..20e66d190113a3 100644 --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -1618,9 +1618,8 @@ _mm_cvtt_ps2pi(__m128 __a) /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the /// converted value of the second operand. The upper 96 bits are copied from /// the upper 96 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvtsi32_ss(__m128 __a, int __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a, + int __b) { __a[0] = __b; return __a; } @@ -1641,9 +1640,8 @@ _mm_cvtsi32_ss(__m128 __a, int __b) /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the /// converted value of the second operand. The upper 96 bits are copied from /// the upper 96 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvt_si2ss(__m128 __a, int __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a, + int __b) { return _mm_cvtsi32_ss(__a, __b); } @@ -1665,9 +1663,8 @@ _mm_cvt_si2ss(__m128 __a, int __b) /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the /// converted value of the second operand. The upper 96 bits are copied from /// the upper 96 bits of the first operand. 
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvtsi64_ss(__m128 __a, long long __b) -{ +static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_cvtsi64_ss(__m128 __a, long long __b) { __a[0] = __b; return __a; } diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c index 932d6f36b09b66..391e049a6ae3ef 100644 --- a/clang/test/CodeGen/X86/sse-builtins.c +++ b/clang/test/CodeGen/X86/sse-builtins.c @@ -948,6 +948,15 @@ void test_constexpr() { constexpr __m128 v_mm_movelh_ps = _mm_movelh_ps(k1, k2); static_assert(v_mm_movelh_ps[0] == +1.0f && v_mm_movelh_ps[1] == +0.0f && v_mm_movelh_ps[2] == +8.0f && v_mm_movelh_ps[3] == +4.0f); + constexpr __m128 v_mm_cvtsi32_ss = _mm_cvtsi32_ss(k1, 42); + static_assert(v_mm_cvtsi32_ss[0] == 42.0f && v_mm_cvtsi32_ss[1] == +0.0f && v_mm_cvtsi32_ss[2] == +2.0f && v_mm_cvtsi32_ss[3] == +4.0f); + + constexpr __m128 v_mm_cvt_si2ss = _mm_cvt_si2ss(k2, -99); + static_assert(v_mm_cvt_si2ss[0] == -99.0f && v_mm_cvt_si2ss[1] == +4.0f && v_mm_cvt_si2ss[2] == +2.0f && v_mm_cvt_si2ss[3] == +1.0f); + + constexpr __m128 v_mm_cvtsi64_ss = _mm_cvtsi64_ss(k3, 555); + static_assert(v_mm_cvtsi64_ss[0] == 555.0f && v_mm_cvtsi64_ss[1] == -5.0f && v_mm_cvtsi64_ss[2] == +6.0f && v_mm_cvtsi64_ss[3] == +7.0f); + static_assert(_mm_cvtss_f32(k2) == +8.0f); } From ea2b8976e69ad70220f71abf28d6781dc1e41fab Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 10 Oct 2024 07:57:45 -0400 Subject: [PATCH 003/177] [libc++] Remove nonexistent directory from check-generated-output (#111746) The libcxx/benchmarks directory was moved to libcxx/test/benchmarks, which is already checked by that grep command. 
--- libcxx/utils/ci/run-buildbot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 536d6270361307..0ce1def5f37224 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -238,7 +238,7 @@ check-generated-output) # Reject patches that introduce non-ASCII characters or hard tabs. # Depends on LC_COLLATE set at the top of this script. set -x - ! grep -rn '[^ -~]' libcxx/include libcxx/src libcxx/test libcxx/benchmarks \ + ! grep -rn '[^ -~]' libcxx/include libcxx/src libcxx/test \ --exclude '*.dat' \ --exclude '*unicode*.cpp' \ --exclude '*print*.sh.cpp' \ From b94c763b7c123995ae31a6ce44223e89ef9f226a Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Thu, 10 Oct 2024 04:59:34 -0700 Subject: [PATCH 004/177] [Fuchsia][CMake] Set output name for libc++ shared library (#111791) This is a dependency of #80007. --- clang/cmake/caches/Fuchsia-stage2.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 26ae30c71b4df3..5af98c7b3b3fba 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -345,6 +345,7 @@ foreach(target armv6m-none-eabi;armv7m-none-eabi;armv8m.main-none-eabi) set(RUNTIMES_${target}_LIBCXX_CXX_ABI none CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_STATIC ON CACHE BOOL "") + set(RUNTIMES_${target}_LIBCXX_SHARED_OUTPUT_NAME "c++-shared" CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_LIBC "llvm-libc" CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") @@ -396,6 +397,7 @@ foreach(target riscv32-unknown-elf) set(RUNTIMES_${target}_LIBCXX_CXX_ABI none CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "") 
set(RUNTIMES_${target}_LIBCXX_ENABLE_STATIC ON CACHE BOOL "") + set(RUNTIMES_${target}_LIBCXX_SHARED_OUTPUT_NAME "c++-shared" CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_LIBC "llvm-libc" CACHE STRING "") set(RUNTIMES_${target}_LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") set(RUNTIMES_${target}_LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") From 917ada35cd937ad4104dff89c48398bd796ba6b7 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 10 Oct 2024 08:00:01 -0400 Subject: [PATCH 005/177] [runtimes] Always define cxx_shared, cxx_static & other targets (#80007) This patch always defines the cxx_shared, cxx_static & other top-level targets. However, they are marked as EXCLUDE_FROM_ALL when we don't want to build them. Simply declaring the targets should be of no harm, and it allows other projects to mention these targets regardless of whether they end up being built or not. This patch basically moves the definition of e.g. cxx_shared out of the `if (LIBCXX_ENABLE_SHARED)` and instead marks it as EXCLUDE_FROM_ALL conditionally on whether LIBCXX_ENABLE_SHARED is passed. It then does the same for libunwind and libc++abi targets. I purposefully avoided to reformat the files (which now has inconsistent indentation) because I wanted to keep the diff minimal, and I know this is an area of the code where folks may have downstream diffs. I will re-indent the code separately once this patch lands. This is a reapplication of 79ee0342dbf0, which was reverted in a3539090884c because it broke the TSAN and the Fuchsia builds. 
Resolves #77654 Differential Revision: https://reviews.llvm.org/D134221 --- libcxx/cmake/caches/AIX.cmake | 7 +++++++ libcxx/cmake/caches/Armv7M-picolibc.cmake | 11 +++++++++++ libcxx/src/CMakeLists.txt | 22 +++++++++++----------- libcxxabi/src/CMakeLists.txt | 20 +++++++++++--------- libunwind/src/CMakeLists.txt | 18 ++++++++++-------- 5 files changed, 50 insertions(+), 28 deletions(-) diff --git a/libcxx/cmake/caches/AIX.cmake b/libcxx/cmake/caches/AIX.cmake index 4ec78f9bbd5923..036fdfdae60725 100644 --- a/libcxx/cmake/caches/AIX.cmake +++ b/libcxx/cmake/caches/AIX.cmake @@ -16,3 +16,10 @@ set(LIBCXX_CXX_ABI libcxxabi CACHE STRING "") set(LIBUNWIND_ENABLE_SHARED ON CACHE BOOL "") set(LIBUNWIND_ENABLE_STATIC OFF CACHE BOOL "") set(LIBCXX_ABI_DEFINES "_LIBCPP_ABI_IOS_ALLOW_ARBITRARY_FILL_VALUE" CACHE STRING "") + +# On AIX, both shared and static libraries are archived. As a result, both the static and the shared targets end +# up with a `.a` suffix, which conflict. To workaround that, we set a different output name for the static +# libraries, which we never actually build anyway. For more information, see https://gitlab.kitware.com/cmake/cmake/-/issues/19494. +set(LIBCXX_STATIC_OUTPUT_NAME "c++-static" CACHE STRING "") +set(LIBCXXABI_STATIC_OUTPUT_NAME "c++abi-static" CACHE STRING "") +set(LIBUNWIND_STATIC_OUTPUT_NAME "unwind-static" CACHE STRING "") diff --git a/libcxx/cmake/caches/Armv7M-picolibc.cmake b/libcxx/cmake/caches/Armv7M-picolibc.cmake index b5f9089308d22e..0f8189b457285e 100644 --- a/libcxx/cmake/caches/Armv7M-picolibc.cmake +++ b/libcxx/cmake/caches/Armv7M-picolibc.cmake @@ -39,3 +39,14 @@ set(LIBUNWIND_IS_BAREMETAL ON CACHE BOOL "") set(LIBUNWIND_REMEMBER_HEAP_ALLOC ON CACHE BOOL "") set(LIBUNWIND_USE_COMPILER_RT ON CACHE BOOL "") find_program(QEMU_SYSTEM_ARM qemu-system-arm REQUIRED) + +# On embedded platforms that don't support shared library targets, CMake implicitly changes shared +# library targets to be static library targets. 
This results in duplicate definitions of the static +# library targets even though we might not ever build the shared library target, which breaks the +# build. To work around this, we change the output name of the shared library target so that it +# can't conflict with the static library target. +# +# This is tracked by https://gitlab.kitware.com/cmake/cmake/-/issues/25759. +set(LIBCXX_SHARED_OUTPUT_NAME "c++-shared" CACHE STRING "") +set(LIBCXXABI_SHARED_OUTPUT_NAME "c++abi-shared" CACHE STRING "") +set(LIBUNWIND_SHARED_OUTPUT_NAME "unwind-shared" CACHE STRING "") diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index b187677ff2db52..9f31822065be9d 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -143,10 +143,6 @@ if (LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS) ) endif() -if(NOT LIBCXX_INSTALL_LIBRARY) - set(exclude_from_all EXCLUDE_FROM_ALL) -endif() - if (APPLE AND LLVM_USE_SANITIZER) if (("${LLVM_USE_SANITIZER}" STREQUAL "Address") OR ("${LLVM_USE_SANITIZER}" STREQUAL "Address;Undefined") OR @@ -177,13 +173,13 @@ split_list(LIBCXX_COMPILE_FLAGS) split_list(LIBCXX_LINK_FLAGS) # Build the shared library. 
-if (LIBCXX_ENABLE_SHARED) - add_library(cxx_shared SHARED ${exclude_from_all} ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) + add_library(cxx_shared SHARED ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) target_include_directories(cxx_shared PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(cxx_shared PUBLIC cxx-headers libcxx-libc-shared PRIVATE ${LIBCXX_LIBRARIES}) set_target_properties(cxx_shared PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}" LINK_FLAGS "${LIBCXX_LINK_FLAGS}" OUTPUT_NAME "${LIBCXX_SHARED_OUTPUT_NAME}" @@ -247,7 +243,10 @@ if (LIBCXX_ENABLE_SHARED) ) endif() +if (LIBCXX_ENABLE_SHARED) list(APPEND LIBCXX_BUILD_TARGETS "cxx_shared") +endif() + if(WIN32 AND NOT MINGW AND NOT "${CMAKE_HOST_SYSTEM_NAME}" STREQUAL "Windows") # Since we most likely do not have a mt.exe replacement, disable the # manifest bundling. This allows a normal cmake invocation to pass which @@ -260,19 +259,18 @@ if (LIBCXX_ENABLE_SHARED) APPEND_STRING PROPERTY LINK_FLAGS " -Xlinker /MANIFEST:NO") endif() endif() -endif() set(CMAKE_STATIC_LIBRARY_PREFIX "lib") # Build the static library. 
-if (LIBCXX_ENABLE_STATIC) - add_library(cxx_static STATIC ${exclude_from_all} ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) + add_library(cxx_static STATIC ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) target_include_directories(cxx_static PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(cxx_static PUBLIC cxx-headers libcxx-libc-static PRIVATE ${LIBCXX_LIBRARIES} PRIVATE libcxx-abi-static) set_target_properties(cxx_static PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}" LINK_FLAGS "${LIBCXX_LINK_FLAGS}" OUTPUT_NAME "${LIBCXX_STATIC_OUTPUT_NAME}" @@ -295,16 +293,18 @@ if (LIBCXX_ENABLE_STATIC) target_compile_definitions(cxx_static PRIVATE _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS=) endif() - list(APPEND LIBCXX_BUILD_TARGETS "cxx_static") + if (LIBCXX_ENABLE_STATIC) + list(APPEND LIBCXX_BUILD_TARGETS "cxx_static") + endif() # Attempt to merge the libc++.a archive and the ABI library archive into one. if (LIBCXX_STATICALLY_LINK_ABI_IN_STATIC_LIBRARY) target_link_libraries(cxx_static PRIVATE libcxx-abi-static-objects) endif() -endif() # Add a meta-target for both libraries. 
add_custom_target(cxx DEPENDS ${LIBCXX_BUILD_TARGETS}) +# Build the experimental static library set(LIBCXX_EXPERIMENTAL_SOURCES experimental/keep.cpp ) diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt index 480e528b819bb9..e496cf3339164e 100644 --- a/libcxxabi/src/CMakeLists.txt +++ b/libcxxabi/src/CMakeLists.txt @@ -184,10 +184,10 @@ if (CMAKE_POSITION_INDEPENDENT_CODE OR NOT DEFINED CMAKE_POSITION_INDEPENDENT_CO endif() target_compile_options(cxxabi_shared_objects PRIVATE "${LIBCXXABI_ADDITIONAL_COMPILE_FLAGS}") -if (LIBCXXABI_ENABLE_SHARED) add_library(cxxabi_shared SHARED) set_target_properties(cxxabi_shared PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" LINK_FLAGS "${LIBCXXABI_LINK_FLAGS}" OUTPUT_NAME "${LIBCXXABI_SHARED_OUTPUT_NAME}" SOVERSION "1" @@ -208,10 +208,12 @@ if (LIBCXXABI_ENABLE_SHARED) PUBLIC cxxabi_shared_objects PRIVATE ${LIBCXXABI_LIBRARIES}) +if (LIBCXXABI_ENABLE_SHARED) list(APPEND LIBCXXABI_BUILD_TARGETS "cxxabi_shared") - if (LIBCXXABI_INSTALL_SHARED_LIBRARY) - list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_shared") - endif() +endif() +if (LIBCXXABI_INSTALL_SHARED_LIBRARY) + list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_shared") +endif() # TODO: Move this to libc++'s HandleLibCXXABI.cmake since this is effectively trying to control # what libc++ re-exports. @@ -254,7 +256,6 @@ if (LIBCXXABI_ENABLE_SHARED) reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-v0.exp") endif() endif() -endif() # Build the static library. 
add_library(cxxabi_static_objects OBJECT EXCLUDE_FROM_ALL ${LIBCXXABI_SOURCES} ${LIBCXXABI_HEADERS}) @@ -294,13 +295,13 @@ if(LIBCXXABI_HERMETIC_STATIC_LIBRARY) _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS=) endif() -if (LIBCXXABI_ENABLE_STATIC) add_library(cxxabi_static STATIC) if (LIBCXXABI_USE_LLVM_UNWINDER AND NOT LIBCXXABI_STATICALLY_LINK_UNWINDER_IN_STATIC_LIBRARY) target_link_libraries(cxxabi_static PUBLIC unwind_static) endif() set_target_properties(cxxabi_static PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" LINK_FLAGS "${LIBCXXABI_LINK_FLAGS}" OUTPUT_NAME "${LIBCXXABI_STATIC_OUTPUT_NAME}" ) @@ -308,10 +309,11 @@ if (LIBCXXABI_ENABLE_STATIC) PUBLIC cxxabi_static_objects PRIVATE ${LIBCXXABI_STATIC_LIBRARIES} ${LIBCXXABI_LIBRARIES}) +if (LIBCXXABI_ENABLE_STATIC) list(APPEND LIBCXXABI_BUILD_TARGETS "cxxabi_static") - if (LIBCXXABI_INSTALL_STATIC_LIBRARY) - list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_static") - endif() +endif() +if (LIBCXXABI_INSTALL_STATIC_LIBRARY) + list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_static") endif() # Add a meta-target for both libraries. 
diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt index 125cf4ffe912a3..3065bfc8a07050 100644 --- a/libunwind/src/CMakeLists.txt +++ b/libunwind/src/CMakeLists.txt @@ -153,11 +153,11 @@ if (CMAKE_POSITION_INDEPENDENT_CODE OR NOT DEFINED CMAKE_POSITION_INDEPENDENT_CO set_target_properties(unwind_shared_objects PROPERTIES POSITION_INDEPENDENT_CODE ON) # must set manually because it's an object library endif() -if (LIBUNWIND_ENABLE_SHARED) add_library(unwind_shared SHARED) target_link_libraries(unwind_shared PUBLIC unwind_shared_objects) set_target_properties(unwind_shared PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" LINK_FLAGS "${LIBUNWIND_LINK_FLAGS}" LINKER_LANGUAGE C OUTPUT_NAME "${LIBUNWIND_SHARED_OUTPUT_NAME}" @@ -165,10 +165,11 @@ if (LIBUNWIND_ENABLE_SHARED) SOVERSION "1" ) +if (LIBUNWIND_ENABLE_SHARED) list(APPEND LIBUNWIND_BUILD_TARGETS "unwind_shared") - if (LIBUNWIND_INSTALL_SHARED_LIBRARY) - list(APPEND LIBUNWIND_INSTALL_TARGETS "unwind_shared") - endif() +endif() +if (LIBUNWIND_INSTALL_SHARED_LIBRARY) + list(APPEND LIBUNWIND_INSTALL_TARGETS "unwind_shared") endif() # Build the static library. @@ -199,20 +200,21 @@ if(LIBUNWIND_HIDE_SYMBOLS) target_compile_definitions(unwind_static_objects PRIVATE _LIBUNWIND_HIDE_SYMBOLS) endif() -if (LIBUNWIND_ENABLE_STATIC) add_library(unwind_static STATIC) target_link_libraries(unwind_static PUBLIC unwind_static_objects) set_target_properties(unwind_static PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" LINK_FLAGS "${LIBUNWIND_LINK_FLAGS}" LINKER_LANGUAGE C OUTPUT_NAME "${LIBUNWIND_STATIC_OUTPUT_NAME}" ) +if (LIBUNWIND_ENABLE_STATIC) list(APPEND LIBUNWIND_BUILD_TARGETS "unwind_static") - if (LIBUNWIND_INSTALL_STATIC_LIBRARY) - list(APPEND LIBUNWIND_INSTALL_TARGETS "unwind_static") - endif() +endif() +if (LIBUNWIND_INSTALL_STATIC_LIBRARY) + list(APPEND LIBUNWIND_INSTALL_TARGETS "unwind_static") endif() # Add a meta-target for both libraries. 
From f8b7a65395a07073feff367145965214d95ba99a Mon Sep 17 00:00:00 2001 From: Petr Kurapov Date: Thu, 10 Oct 2024 14:04:52 +0200 Subject: [PATCH 006/177] [MLIR][GPU-LLVM] Add in-pass signature update for opencl kernels (#105664) Default to Global address space for memrefs that do not have an explicit address space set in the IR. --------- Co-authored-by: Victor Perez Co-authored-by: Jakub Kuderski Co-authored-by: Victor Perez --- .../Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp | 49 ++++++++++++++++++ .../GPUToLLVMSPV/gpu-to-llvm-spv.mlir | 50 ++++++++++++++++--- 2 files changed, 93 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp index 544f1f4a4f6a79..bb6a38c0e76edf 100644 --- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp +++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp @@ -34,6 +34,8 @@ #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/FormatVariadic.h" +#define DEBUG_TYPE "gpu-to-llvm-spv" + using namespace mlir; namespace mlir { @@ -316,6 +318,38 @@ struct GPUShuffleConversion final : ConvertOpToLLVMPattern { } }; +class MemorySpaceToOpenCLMemorySpaceConverter final : public TypeConverter { +public: + MemorySpaceToOpenCLMemorySpaceConverter(MLIRContext *ctx) { + addConversion([](Type t) { return t; }); + addConversion([ctx](BaseMemRefType memRefType) -> std::optional { + // Attach global addr space attribute to memrefs with no addr space attr + Attribute memSpaceAttr = memRefType.getMemorySpace(); + if (memSpaceAttr) + return std::nullopt; + + unsigned globalAddrspace = storageClassToAddressSpace( + spirv::ClientAPI::OpenCL, spirv::StorageClass::CrossWorkgroup); + Attribute addrSpaceAttr = + IntegerAttr::get(IntegerType::get(ctx, 64), globalAddrspace); + if (auto rankedType = dyn_cast(memRefType)) { + return MemRefType::get(memRefType.getShape(), + memRefType.getElementType(), + rankedType.getLayout(), addrSpaceAttr); + } + return 
UnrankedMemRefType::get(memRefType.getElementType(), + addrSpaceAttr); + }); + addConversion([this](FunctionType type) { + auto inputs = llvm::map_to_vector( + type.getInputs(), [this](Type ty) { return convertType(ty); }); + auto results = llvm::map_to_vector( + type.getResults(), [this](Type ty) { return convertType(ty); }); + return FunctionType::get(type.getContext(), inputs, results); + }); + } +}; + //===----------------------------------------------------------------------===// // Subgroup query ops. //===----------------------------------------------------------------------===// @@ -382,6 +416,21 @@ struct GPUToLLVMSPVConversionPass final LLVMTypeConverter converter(context, options); LLVMConversionTarget target(*context); + // Force OpenCL address spaces when they are not present + { + MemorySpaceToOpenCLMemorySpaceConverter converter(context); + AttrTypeReplacer replacer; + replacer.addReplacement([&converter](BaseMemRefType origType) + -> std::optional { + return converter.convertType(origType); + }); + + replacer.recursivelyReplaceElementsIn(getOperation(), + /*replaceAttrs=*/true, + /*replaceLocs=*/false, + /*replaceTypes=*/true); + } + target.addIllegalOp, %{{.*}}: !llvm.ptr<1>, %{{.*}}: i64) attributes {gpu.kernel} { + // CHECK-32: llvm.func spir_kernelcc @kernel_with_conv_args(%{{.*}}: i32, %{{.*}}: !llvm.ptr<1>, %{{.*}}: !llvm.ptr<1>, %{{.*}}: i32) attributes {gpu.kernel} { gpu.func @kernel_with_conv_args(%arg0: index, %arg1: memref) kernel { gpu.return } - // CHECK-64: llvm.func spir_kernelcc @kernel_with_sized_memref(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64) attributes {gpu.kernel} { - // CHECK-32: llvm.func spir_kernelcc @kernel_with_sized_memref(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32) attributes {gpu.kernel} { + // CHECK-64: llvm.func spir_kernelcc @kernel_with_sized_memref(%{{.*}}: !llvm.ptr<1>, %{{.*}}: !llvm.ptr<1>, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64) 
attributes {gpu.kernel} { + // CHECK-32: llvm.func spir_kernelcc @kernel_with_sized_memref(%{{.*}}: !llvm.ptr<1>, %{{.*}}: !llvm.ptr<1>, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32) attributes {gpu.kernel} { gpu.func @kernel_with_sized_memref(%arg0: memref<1xindex>) kernel { gpu.return } - // CHECK-64: llvm.func spir_kernelcc @kernel_with_ND_memref(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64) attributes {gpu.kernel} { - // CHECK-32: llvm.func spir_kernelcc @kernel_with_ND_memref(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32) attributes {gpu.kernel} { + // CHECK-64: llvm.func spir_kernelcc @kernel_with_ND_memref(%{{.*}}: !llvm.ptr<1>, %{{.*}}: !llvm.ptr<1>, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64) attributes {gpu.kernel} { + // CHECK-32: llvm.func spir_kernelcc @kernel_with_ND_memref(%{{.*}}: !llvm.ptr<1>, %{{.*}}: !llvm.ptr<1>, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32) attributes {gpu.kernel} { gpu.func @kernel_with_ND_memref(%arg0: memref<128x128x128xindex>) kernel { gpu.return } @@ -566,6 +566,44 @@ gpu.module @kernels { // ----- +gpu.module @kernels { +// CHECK: llvm.func spir_funccc @_Z12get_group_idj(i32) +// CHECK-LABEL: llvm.func spir_funccc @no_address_spaces( +// CHECK-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1> +// CHECK-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1> +// CHECK-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1> + gpu.func @no_address_spaces(%arg0: memref, %arg1: memref>, %arg2: memref) { + gpu.return + } + +// CHECK-LABEL: llvm.func spir_kernelcc @no_address_spaces_complex( +// CHECK-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1> +// CHECK-SAME: %{{[a-zA-Z_][a-zA-Z0-9_]*}}: !llvm.ptr<1> +// CHECK: func.call @no_address_spaces_callee(%{{[0-9]+}}, 
%{{[0-9]+}}) +// CHECK-SAME: : (memref<2x2xf32, 1>, memref<4xf32, 1>) + gpu.func @no_address_spaces_complex(%arg0: memref<2x2xf32>, %arg1: memref<4xf32>) kernel { + func.call @no_address_spaces_callee(%arg0, %arg1) : (memref<2x2xf32>, memref<4xf32>) -> () + gpu.return + } +// CHECK-LABEL: func.func @no_address_spaces_callee( +// CHECK-SAME: [[ARG0:%.*]]: memref<2x2xf32, 1> +// CHECK-SAME: [[ARG1:%.*]]: memref<4xf32, 1> +// CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: [[I0:%.*]] = llvm.call spir_funccc @_Z12get_group_idj([[C0]]) { +// CHECK-32: [[I1:%.*]] = builtin.unrealized_conversion_cast [[I0]] : i32 to index +// CHECK-64: [[I1:%.*]] = builtin.unrealized_conversion_cast [[I0]] : i64 to index +// CHECK: [[LD:%.*]] = memref.load [[ARG0]]{{\[}}[[I1]], [[I1]]{{\]}} : memref<2x2xf32, 1> +// CHECK: memref.store [[LD]], [[ARG1]]{{\[}}[[I1]]{{\]}} : memref<4xf32, 1> + func.func @no_address_spaces_callee(%arg0: memref<2x2xf32>, %arg1: memref<4xf32>) { + %block_id = gpu.block_id x + %0 = memref.load %arg0[%block_id, %block_id] : memref<2x2xf32> + memref.store %0, %arg1[%block_id] : memref<4xf32> + func.return + } +} + +// ----- + // Lowering of subgroup query operations // CHECK-DAG: llvm.func spir_funccc @_Z18get_sub_group_size() -> i32 attributes {no_unwind, will_return} From 55d51dd9dca8220ffaf9260d56dae9f5c34b7120 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Thu, 10 Oct 2024 14:10:38 +0200 Subject: [PATCH 007/177] [clang][bytecode] Fix temporary lvalue base expression (#111808) We need to use the MaterializeTemporaryExpr here so the checks in ExprConstant.cpp do the right thing. 
--- clang/lib/AST/ByteCode/Compiler.cpp | 9 +++++---- clang/lib/AST/ByteCode/Compiler.h | 3 ++- clang/test/AST/ByteCode/cxx1z.cpp | 12 ++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 clang/test/AST/ByteCode/cxx1z.cpp diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index fe44238ea11869..ba4c5600d613b0 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -2728,7 +2728,7 @@ bool Compiler::VisitMaterializeTemporaryExpr( const Expr *Inner = E->getSubExpr()->skipRValueSubobjectAdjustments(); if (std::optional LocalIndex = - allocateLocal(Inner, E->getExtendingDecl())) { + allocateLocal(E, Inner->getType(), E->getExtendingDecl())) { InitLinkScope ILS(this, InitLink::Temp(*LocalIndex)); if (!this->emitGetPtrLocal(*LocalIndex, E)) return false; @@ -4029,7 +4029,8 @@ unsigned Compiler::allocateLocalPrimitive(DeclTy &&Src, PrimType Ty, template std::optional -Compiler::allocateLocal(DeclTy &&Src, const ValueDecl *ExtendingDecl) { +Compiler::allocateLocal(DeclTy &&Src, QualType Ty, + const ValueDecl *ExtendingDecl) { // Make sure we don't accidentally register the same decl twice. 
if ([[maybe_unused]] const auto *VD = dyn_cast_if_present(Src.dyn_cast())) { @@ -4037,7 +4038,6 @@ Compiler::allocateLocal(DeclTy &&Src, const ValueDecl *ExtendingDecl) { assert(!Locals.contains(VD)); } - QualType Ty; const ValueDecl *Key = nullptr; const Expr *Init = nullptr; bool IsTemporary = false; @@ -4050,7 +4050,8 @@ Compiler::allocateLocal(DeclTy &&Src, const ValueDecl *ExtendingDecl) { } if (auto *E = Src.dyn_cast()) { IsTemporary = true; - Ty = E->getType(); + if (Ty.isNull()) + Ty = E->getType(); } Descriptor *D = P.createDescriptor( diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 22e078f3fe546f..4253e7b3248c9f 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -302,7 +302,8 @@ class Compiler : public ConstStmtVisitor, bool>, /// Allocates a space storing a local given its type. std::optional - allocateLocal(DeclTy &&Decl, const ValueDecl *ExtendingDecl = nullptr); + allocateLocal(DeclTy &&Decl, QualType Ty = QualType(), + const ValueDecl *ExtendingDecl = nullptr); unsigned allocateTemporary(const Expr *E); private: diff --git a/clang/test/AST/ByteCode/cxx1z.cpp b/clang/test/AST/ByteCode/cxx1z.cpp new file mode 100644 index 00000000000000..2b5d215f016548 --- /dev/null +++ b/clang/test/AST/ByteCode/cxx1z.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++17 -verify=expected,both %s +// RUN: %clang_cc1 -std=c++17 -verify=ref,both %s + +template struct A {}; +namespace Temp { + struct S { int n; }; + constexpr S &addr(S &&s) { return s; } + A a; // both-error {{reference to temporary object}} + A b; // both-error {{pointer to temporary object}} + A c; // both-error {{reference to subobject of temporary object}} + A d; // both-error {{pointer to subobject of temporary object}} +} From b773da0c5eed06f21f4caeea5eae47cacefb376c Mon Sep 17 00:00:00 2001 From: Vladislav Dzhidzhoev Date: Thu, 10 Oct 2024 14:21:25 +0200 Subject: [PATCH 008/177] 
[lldb][test] Use $(STRIP) instead of strip in API tests (Darwin-only change) (#111816) This makes tests more portable. Make variables for LLVM utils are passed to `make` on Darwin as well. Co-authored-by: Vladimir Vereschaka --- .../Python/lldbsuite/test/builders/builder.py | 46 +++++++++---------- lldb/test/API/lang/objc/hidden-ivars/Makefile | 4 +- .../API/lang/objc/objc-ivar-stripped/Makefile | 2 +- .../objc/objc-static-method-stripped/Makefile | 2 +- lldb/test/API/macosx/add-dsym/Makefile | 2 +- lldb/test/API/tools/lldb-dap/module/Makefile | 2 +- .../tools/lldb-dap/terminated-event/Makefile | 2 +- 7 files changed, 30 insertions(+), 30 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py index f813d68e46e82a..d399a5b228c131 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/builder.py +++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py @@ -169,31 +169,31 @@ def getToolchainUtil(util_name): if not os.getenv("LLVM_AR"): utils.extend(["LLVM_AR=%s" % getToolchainUtil("llvm-ar")]) - if not lldbplatformutil.platformIsDarwin(): - if cc_type in ["clang", "cc", "gcc"]: - util_paths = {} - # Assembly a toolchain side tool cmd based on passed CC. - for var, name in util_names.items(): - # Do not override explicity specified tool from the cmd line. - if not os.getenv(var): - util_paths[var] = getToolchainUtil("llvm-" + name) - else: - util_paths[var] = os.getenv(var) - utils.extend(["AR=%s" % util_paths["ARCHIVER"]]) - - # Look for llvm-dwp or gnu dwp - if not lldbutil.which(util_paths["DWP"]): - util_paths["DWP"] = getToolchainUtil("llvm-dwp") - if not lldbutil.which(util_paths["DWP"]): - util_paths["DWP"] = lldbutil.which("llvm-dwp") + if cc_type in ["clang", "cc", "gcc"]: + util_paths = {} + # Assembly a toolchain side tool cmd based on passed CC. + for var, name in util_names.items(): + # Do not override explicity specified tool from the cmd line. 
+ if not os.getenv(var): + util_paths[var] = getToolchainUtil("llvm-" + name) + else: + util_paths[var] = os.getenv(var) + utils.extend(["AR=%s" % util_paths["ARCHIVER"]]) + + # Look for llvm-dwp or gnu dwp + if not lldbutil.which(util_paths["DWP"]): + util_paths["DWP"] = getToolchainUtil("llvm-dwp") + if not lldbutil.which(util_paths["DWP"]): + util_paths["DWP"] = lldbutil.which("llvm-dwp") + if not util_paths["DWP"]: + util_paths["DWP"] = lldbutil.which("dwp") if not util_paths["DWP"]: - util_paths["DWP"] = lldbutil.which("dwp") - if not util_paths["DWP"]: - del util_paths["DWP"] + del util_paths["DWP"] - for var, path in util_paths.items(): - utils.append("%s=%s" % (var, path)) - else: + for var, path in util_paths.items(): + utils.append("%s=%s" % (var, path)) + + if lldbplatformutil.platformIsDarwin(): utils.extend(["AR=%slibtool" % os.getenv("CROSS_COMPILE", "")]) return [ diff --git a/lldb/test/API/lang/objc/hidden-ivars/Makefile b/lldb/test/API/lang/objc/hidden-ivars/Makefile index 283e8a118fb16a..c94c0dee1b9ce9 100644 --- a/lldb/test/API/lang/objc/hidden-ivars/Makefile +++ b/lldb/test/API/lang/objc/hidden-ivars/Makefile @@ -14,8 +14,8 @@ endif stripped: a.out libInternalDefiner.dylib mkdir stripped - strip -Sx a.out -o stripped/a.out - strip -Sx libInternalDefiner.dylib -o stripped/libInternalDefiner.dylib + $(STRIP) -Sx a.out -o stripped/a.out + $(STRIP) -Sx libInternalDefiner.dylib -o stripped/libInternalDefiner.dylib ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - stripped/a.out endif diff --git a/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile b/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile index 8b63215d6d9da6..eed66d2a965d11 100644 --- a/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile +++ b/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile @@ -6,7 +6,7 @@ all: a.out.stripped include Makefile.rules a.out.stripped: a.out.dSYM - strip -o a.out.stripped a.out + $(STRIP) -o a.out.stripped a.out ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - 
a.out.stripped endif diff --git a/lldb/test/API/lang/objc/objc-static-method-stripped/Makefile b/lldb/test/API/lang/objc/objc-static-method-stripped/Makefile index ed312938c9cd11..4936553c56f7c0 100644 --- a/lldb/test/API/lang/objc/objc-static-method-stripped/Makefile +++ b/lldb/test/API/lang/objc/objc-static-method-stripped/Makefile @@ -4,7 +4,7 @@ LD_EXTRAS := -lobjc -framework Foundation default: a.out.stripped a.out.stripped: a.out.dSYM - strip -o a.out.stripped a.out + $(STRIP) -o a.out.stripped a.out ln -sf a.out.dSYM a.out.stripped.dSYM include Makefile.rules diff --git a/lldb/test/API/macosx/add-dsym/Makefile b/lldb/test/API/macosx/add-dsym/Makefile index 4e1ec2202d0b09..b949b308d3acce 100644 --- a/lldb/test/API/macosx/add-dsym/Makefile +++ b/lldb/test/API/macosx/add-dsym/Makefile @@ -8,7 +8,7 @@ hide.app/Contents/a.out.dSYM: mkdir hide.app mkdir hide.app/Contents mv a.out.dSYM hide.app/Contents - strip -x a.out + $(STRIP) -x a.out ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - a.out endif diff --git a/lldb/test/API/tools/lldb-dap/module/Makefile b/lldb/test/API/tools/lldb-dap/module/Makefile index b30baf48b972ef..c7d626a1a7e4c1 100644 --- a/lldb/test/API/tools/lldb-dap/module/Makefile +++ b/lldb/test/API/tools/lldb-dap/module/Makefile @@ -10,7 +10,7 @@ include Makefile.rules all: a.out.stripped a.out.stripped: - strip -o a.out.stripped a.out + $(STRIP) -o a.out.stripped a.out ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - a.out.stripped diff --git a/lldb/test/API/tools/lldb-dap/terminated-event/Makefile b/lldb/test/API/tools/lldb-dap/terminated-event/Makefile index b30baf48b972ef..c7d626a1a7e4c1 100644 --- a/lldb/test/API/tools/lldb-dap/terminated-event/Makefile +++ b/lldb/test/API/tools/lldb-dap/terminated-event/Makefile @@ -10,7 +10,7 @@ include Makefile.rules all: a.out.stripped a.out.stripped: - strip -o a.out.stripped a.out + $(STRIP) -o a.out.stripped a.out ifneq "$(CODESIGN)" "" $(CODESIGN) -fs - a.out.stripped From 36a0d442eb4d2f1e0782bc2a1b1715fc7631faec Mon 
Sep 17 00:00:00 2001 From: Harrison Hao <57025411+harrisonGPU@users.noreply.github.com> Date: Thu, 10 Oct 2024 20:45:40 +0800 Subject: [PATCH 009/177] [LLVM][DOCS] Add documentation for 'host' and 'Native' options in LLVM_TARGETS_TO_BUILD. (#111382) From https://github.com/llvm/llvm-project/issues/111356 --- llvm/docs/CMake.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 191230101c4d24..91e34781ef307e 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -847,6 +847,12 @@ enabled sub-projects. Nearly all of these variable names begin with The full list, as of March 2023, is: ``AArch64;AMDGPU;ARM;AVR;BPF;Hexagon;Lanai;LoongArch;Mips;MSP430;NVPTX;PowerPC;RISCV;Sparc;SystemZ;VE;WebAssembly;X86;XCore`` + You can also specify ``host`` or ``Native`` to automatically detect and + include the target corresponding to the host machine's architecture, or + use ``all`` to include all available targets. + For example, on an x86_64 machine, specifying ``-DLLVM_TARGETS_TO_BUILD=host`` + will include the ``X86`` target. + **LLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN**:BOOL If enabled, the compiler version check will only warn when using a toolchain which is about to be deprecated, instead of emitting an error. From 1f919aa77805b951fb06b44732a87f1f83929247 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 10 Oct 2024 14:10:41 +0100 Subject: [PATCH 010/177] VectorCombine: lift one-use limitation in foldExtractedCmps (#110902) There are artificial one-use limitations on foldExtractedCmps. Adjust the costs to account for multi-use, and strip the one-use matcher, lifting the limitations. 
--- .../Transforms/Vectorize/VectorCombine.cpp | 25 ++++----- .../VectorCombine/X86/extract-cmp-binop.ll | 54 +++++++++++++++++++ 2 files changed, 67 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 627edb680dfa1e..58145c7e3c5913 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1038,23 +1038,20 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { // The compare predicates should match, and each compare should have a // constant operand. - // TODO: Relax the one-use constraints. Value *B0 = I.getOperand(0), *B1 = I.getOperand(1); Instruction *I0, *I1; Constant *C0, *C1; CmpInst::Predicate P0, P1; - if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) || - !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) || - P0 != P1) + if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) || + !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))) || P0 != P1) return false; // The compare operands must be extracts of the same vector with constant // extract indexes. - // TODO: Relax the one-use constraints. 
Value *X; uint64_t Index0, Index1; - if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) || - !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))) + if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) || + !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))) return false; auto *Ext0 = cast(I0); @@ -1073,14 +1070,16 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { return false; TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost Ext0Cost = + TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0), + Ext1Cost = + TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); InstructionCost OldCost = - TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); - OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); - OldCost += + Ext0Cost + Ext1Cost + TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred) * - 2; - OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); + 2 + + TTI.getArithmeticInstrCost(I.getOpcode(), I.getType()); // The proposed vector pattern is: // vcmp = cmp Pred X, VecC @@ -1096,6 +1095,8 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { ShufMask); NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex); + NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost; + NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost; // Aggressively form vector ops if the cost is equal because the transform // may enable further optimization. 
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll index 462bb13ae7d12a..be5359f549ac94 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll @@ -92,6 +92,60 @@ define i1 @icmp_add_v8i32(<8 x i32> %a) { ret i1 %r } +declare void @use() + +define i1 @fcmp_and_v2f64_multiuse(<2 x double> %a) { +; SSE-LABEL: @fcmp_and_v2f64_multiuse( +; SSE-NEXT: [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 +; SSE-NEXT: call void @use(double [[E1]]) +; SSE-NEXT: [[E2:%.*]] = extractelement <2 x double> [[A]], i32 1 +; SSE-NEXT: [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01 +; SSE-NEXT: [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00 +; SSE-NEXT: [[R:%.*]] = and i1 [[CMP1]], [[CMP2]] +; SSE-NEXT: call void @use(i1 [[R]]) +; SSE-NEXT: ret i1 [[R]] +; +; AVX-LABEL: @fcmp_and_v2f64_multiuse( +; AVX-NEXT: [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0 +; AVX-NEXT: call void @use(double [[E1]]) +; AVX-NEXT: [[TMP1:%.*]] = fcmp olt <2 x double> [[A]], +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[SHIFT]] +; AVX-NEXT: [[R:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0 +; AVX-NEXT: call void @use(i1 [[R]]) +; AVX-NEXT: ret i1 [[R]] +; + %e1 = extractelement <2 x double> %a, i32 0 + call void @use(double %e1) + %e2 = extractelement <2 x double> %a, i32 1 + %cmp1 = fcmp olt double %e1, 42.0 + %cmp2 = fcmp olt double %e2, -8.0 + %r = and i1 %cmp1, %cmp2 + call void @use(i1 %r) + ret i1 %r +} + +define i1 @icmp_xor_v4i32_multiuse(<4 x i32> %a) { +; CHECK-LABEL: @icmp_xor_v4i32_multiuse( +; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 1 +; CHECK-NEXT: call void @use(i32 [[E2]]) +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[A]], +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x 
i1> [[TMP1]], <4 x i1> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], [[SHIFT]] +; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 +; CHECK-NEXT: call void @use(i1 [[R]]) +; CHECK-NEXT: ret i1 [[R]] +; + %e1 = extractelement <4 x i32> %a, i32 3 + %e2 = extractelement <4 x i32> %a, i32 1 + call void @use(i32 %e2) + %cmp1 = icmp sgt i32 %e1, 42 + %cmp2 = icmp sgt i32 %e2, -8 + %r = xor i1 %cmp1, %cmp2 + call void @use(i1 %r) + ret i1 %r +} + ; Negative test - this could CSE/simplify. define i1 @same_extract_index(<4 x i32> %a) { From 159d694c05500a656775f4cbd6931dae9aab290a Mon Sep 17 00:00:00 2001 From: "A. Jiang" Date: Thu, 10 Oct 2024 21:14:05 +0800 Subject: [PATCH 011/177] [libc++] __uglify internal member names of iterators in `bitset` (#111127) [template.bitset.general] indicates that `bitset` shouldn't have member typedef-names `iterator` and `const_iterator`. Currently libc++'s typedef-names are causing ambiguity in name lookup, which isn't conforming. As these iterator types are themselves useful, I think we should just use __uglified member typedef-names for them. Fixes #111125 --- libcxx/docs/ReleaseNotes/20.rst | 4 ++ libcxx/include/bitset | 44 +++++++++--------- .../nonstdmem.uglified.compile.pass.cpp | 46 +++++++++++++++++++ 3 files changed, 72 insertions(+), 22 deletions(-) create mode 100644 libcxx/test/std/utilities/template.bitset/bitset.members/nonstdmem.uglified.compile.pass.cpp diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst index dcb1102d81d641..3a66aecaf57cb2 100644 --- a/libcxx/docs/ReleaseNotes/20.rst +++ b/libcxx/docs/ReleaseNotes/20.rst @@ -78,6 +78,10 @@ Deprecations and Removals supported as an extension anymore, please migrate any code that uses e.g. ``std::vector`` to be standards conforming. +- Non-conforming member typedefs ``iterator`` and ``const_iterator`` of ``std::bitset`` are removed. Previously, they + were private but could cause ambiguity in name lookup. 
Code that expects such ambiguity will possibly not compile in + LLVM 20. + Upcoming Deprecations and Removals ---------------------------------- diff --git a/libcxx/include/bitset b/libcxx/include/bitset index ce23d522168c4c..f90ceaab816cca 100644 --- a/libcxx/include/bitset +++ b/libcxx/include/bitset @@ -187,8 +187,8 @@ protected: typedef __bit_reference<__bitset> reference; typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; + typedef __bit_iterator<__bitset, false> __iterator; + typedef __bit_iterator<__bitset, true> __const_iterator; _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __bitset() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR __bitset(unsigned long long __v) _NOEXCEPT; @@ -199,11 +199,11 @@ protected: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference __make_ref(size_t __pos) const _NOEXCEPT { return const_reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 iterator __make_iter(size_t __pos) _NOEXCEPT { - return iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __iterator __make_iter(size_t __pos) _NOEXCEPT { + return __iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 const_iterator __make_iter(size_t __pos) const _NOEXCEPT { - return const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __const_iterator __make_iter(size_t __pos) const _NOEXCEPT { + return __const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator&=(const __bitset& __v) _NOEXCEPT; @@ -335,8 +335,8 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 
void __bitset<_N_words, _Siz template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long __bitset<_N_words, _Size>::to_ulong(false_type) const { - const_iterator __e = __make_iter(_Size); - const_iterator __i = std::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); + __const_iterator __e = __make_iter(_Size); + __const_iterator __i = std::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); if (__i != __e) __throw_overflow_error("bitset to_ulong overflow error"); @@ -352,8 +352,8 @@ __bitset<_N_words, _Size>::to_ulong(true_type) const { template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 unsigned long long __bitset<_N_words, _Size>::to_ullong(false_type) const { - const_iterator __e = __make_iter(_Size); - const_iterator __i = std::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); + __const_iterator __e = __make_iter(_Size); + __const_iterator __i = std::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); if (__i != __e) __throw_overflow_error("bitset to_ullong overflow error"); @@ -449,8 +449,8 @@ protected: typedef __bit_reference<__bitset> reference; typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; + typedef __bit_iterator<__bitset, false> __iterator; + typedef __bit_iterator<__bitset, true> __const_iterator; _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __bitset() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR __bitset(unsigned long long __v) _NOEXCEPT; @@ -461,11 +461,11 @@ protected: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference __make_ref(size_t __pos) const _NOEXCEPT { return const_reference(&__first_, __storage_type(1) << __pos); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 iterator __make_iter(size_t __pos) _NOEXCEPT { - return iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX23 __iterator __make_iter(size_t __pos) _NOEXCEPT { + return __iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 const_iterator __make_iter(size_t __pos) const _NOEXCEPT { - return const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __const_iterator __make_iter(size_t __pos) const _NOEXCEPT { + return __const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator&=(const __bitset& __v) _NOEXCEPT; @@ -564,8 +564,8 @@ protected: typedef __bit_reference<__bitset> reference; typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; + typedef __bit_iterator<__bitset, false> __iterator; + typedef __bit_iterator<__bitset, true> __const_iterator; _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __bitset() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI explicit _LIBCPP_CONSTEXPR __bitset(unsigned long long) _NOEXCEPT; @@ -576,11 +576,11 @@ protected: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR const_reference __make_ref(size_t) const _NOEXCEPT { return const_reference(nullptr, 1); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 iterator __make_iter(size_t) _NOEXCEPT { - return iterator(nullptr, 0); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __iterator __make_iter(size_t) _NOEXCEPT { + return __iterator(nullptr, 0); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 const_iterator __make_iter(size_t) const _NOEXCEPT { - return const_iterator(nullptr, 0); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 __const_iterator __make_iter(size_t) const _NOEXCEPT { + return __const_iterator(nullptr, 0); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 void operator&=(const __bitset&) _NOEXCEPT {} diff 
--git a/libcxx/test/std/utilities/template.bitset/bitset.members/nonstdmem.uglified.compile.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.members/nonstdmem.uglified.compile.pass.cpp new file mode 100644 index 00000000000000..c9dd923d7130f5 --- /dev/null +++ b/libcxx/test/std/utilities/template.bitset/bitset.members/nonstdmem.uglified.compile.pass.cpp @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// This test ensures that we don't use a non-uglified name 'iterator' and +// 'const_iterator' in the implementation of bitset. +// +// See https://github.com/llvm/llvm-project/issues/111125. + +#include +#include +#include + +struct my_base { + typedef int* iterator; + typedef const int* const_iterator; +}; + +template +struct my_derived : my_base, std::bitset {}; + +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); +static_assert(std::is_same::iterator, int*>::value, ""); + +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const 
int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); +static_assert(std::is_same::const_iterator, const int*>::value, ""); From 90149204bd08c07eb672cd5b19d782fed3d96ddc Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 10 Oct 2024 14:26:46 +0100 Subject: [PATCH 012/177] [ci] Don't add check-all target when pstl project is enabled (#111803) Fixes #110265 Adding check-all causes us to run some tests twice if a project specific target like check-clang is also added. check-pstl is an alternative but as far as I can tell, check-all does not include this so we have not been running the tests in CI anyway. When I tried to run check-pstl locally I got a lot of compiler errors but have not found any instructions on how to setup a correct build environment. Even if such instructions exist, it's probably more than we want to do in CI. According to Louis Dionne, the project is probably not active. So if it's ever revived it'll be up to the new contributors to enable testing. --- .ci/generate-buildkite-pipeline-premerge | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge index 53a43070bf1ca3..7676ff716c4185 100755 --- a/.ci/generate-buildkite-pipeline-premerge +++ b/.ci/generate-buildkite-pipeline-premerge @@ -191,6 +191,9 @@ function keep-modified-projects() { } function check-targets() { + # Do not use "check-all" here because if there is "check-all" plus a + # project specific target like "check-clang", that project's tests + # will be run twice. projects=${@} for project in ${projects}; do case ${project} in @@ -216,7 +219,7 @@ function check-targets() { echo "check-lldb" ;; pstl) - echo "check-all" + # Currently we do not run pstl tests in CI. 
;; libclc) # Currently there is no testing for libclc. From 480e7f0667794822f7f3a065bed73d9a2ecc2d58 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Thu, 10 Oct 2024 15:37:19 +0200 Subject: [PATCH 013/177] [flang] correctly deal with bind(c) derived type result ABI (#111678) Derived type results of BIND(C) function should be returned according the the C ABI for returning the related C struct type. This currently did not happen since the abstract-result pass was forcing the Fortran ABI for all derived type results. use the bind_c attribute that was added on call/func/dispatch in FIR to prevent such rewrite in the abstract result pass, and update the target-rewrite pass to deal with the struct return ABI. So far, the target specific part of the target-rewrite is only implemented for X86-64 according to the "System V Application Binary Interface AMD64 v1", the other targets will hit a TODO, just like for BIND(C), VALUE derived type arguments. This intends to deal with https://github.com/llvm/llvm-project/issues/102113. 
--- .../include/flang/Optimizer/CodeGen/Target.h | 5 + .../flang/Optimizer/Dialect/FIROpsSupport.h | 21 +++ flang/lib/Optimizer/CodeGen/Target.cpp | 68 ++++++++- flang/lib/Optimizer/CodeGen/TargetRewrite.cpp | 137 ++++++++++++++---- .../Optimizer/Transforms/AbstractResult.cpp | 65 ++++++++- flang/test/Fir/abstract-results-bindc.fir | 43 ++++++ flang/test/Fir/struct-return-x86-64.fir | 120 +++++++++++++++ 7 files changed, 419 insertions(+), 40 deletions(-) create mode 100644 flang/test/Fir/abstract-results-bindc.fir create mode 100644 flang/test/Fir/struct-return-x86-64.fir diff --git a/flang/include/flang/Optimizer/CodeGen/Target.h b/flang/include/flang/Optimizer/CodeGen/Target.h index a7161152a5c323..3b38583511927a 100644 --- a/flang/include/flang/Optimizer/CodeGen/Target.h +++ b/flang/include/flang/Optimizer/CodeGen/Target.h @@ -126,6 +126,11 @@ class CodeGenSpecifics { structArgumentType(mlir::Location loc, fir::RecordType recTy, const Marshalling &previousArguments) const = 0; + /// Type representation of a `fir.type` type argument when returned by + /// value. Such value may need to be converted to a hidden reference argument. + virtual Marshalling structReturnType(mlir::Location loc, + fir::RecordType eleTy) const = 0; + /// Type representation of a `boxchar` type argument when passed by value. /// An argument value may need to be passed as a (safe) reference argument. 
/// diff --git a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h index cdbefdb2341485..fb7b1d16f62f3a 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h +++ b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h @@ -177,6 +177,27 @@ inline mlir::NamedAttribute getAdaptToByRefAttr(Builder &builder) { } bool isDummyArgument(mlir::Value v); + +template +inline bool hasProcedureAttr(fir::FortranProcedureFlagsEnumAttr flags) { + return flags && bitEnumContainsAny(flags.getValue(), Flag); +} + +template +inline bool hasProcedureAttr(mlir::Operation *op) { + if (auto firCallOp = mlir::dyn_cast(op)) + return hasProcedureAttr(firCallOp.getProcedureAttrsAttr()); + if (auto firCallOp = mlir::dyn_cast(op)) + return hasProcedureAttr(firCallOp.getProcedureAttrsAttr()); + return hasProcedureAttr( + op->getAttrOfType( + getFortranProcedureFlagsAttrName())); +} + +inline bool hasBindcAttr(mlir::Operation *op) { + return hasProcedureAttr(op); +} + } // namespace fir #endif // FORTRAN_OPTIMIZER_DIALECT_FIROPSSUPPORT_H diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp index a12b59413f4456..6c148dffb0e55a 100644 --- a/flang/lib/Optimizer/CodeGen/Target.cpp +++ b/flang/lib/Optimizer/CodeGen/Target.cpp @@ -100,6 +100,11 @@ struct GenericTarget : public CodeGenSpecifics { TODO(loc, "passing VALUE BIND(C) derived type for this target"); } + CodeGenSpecifics::Marshalling + structReturnType(mlir::Location loc, fir::RecordType ty) const override { + TODO(loc, "returning BIND(C) derived type for this target"); + } + CodeGenSpecifics::Marshalling integerArgumentType(mlir::Location loc, mlir::IntegerType argTy) const override { @@ -533,7 +538,8 @@ struct TargetX86_64 : public GenericTarget { /// When \p recTy is a one field record type that can be passed /// like the field on its own, returns the field type. Returns /// a null type otherwise. 
- mlir::Type passAsFieldIfOneFieldStruct(fir::RecordType recTy) const { + mlir::Type passAsFieldIfOneFieldStruct(fir::RecordType recTy, + bool allowComplex = false) const { auto typeList = recTy.getTypeList(); if (typeList.size() != 1) return {}; @@ -541,6 +547,8 @@ struct TargetX86_64 : public GenericTarget { if (mlir::isa( fieldType)) return fieldType; + if (allowComplex && mlir::isa(fieldType)) + return fieldType; if (mlir::isa(fieldType)) { // Only CHARACTER(1) are expected in BIND(C) contexts, which is the only // contexts where derived type may be passed in registers. @@ -593,7 +601,7 @@ struct TargetX86_64 : public GenericTarget { postMerge(byteOffset, Lo, Hi); if (Lo == ArgClass::Memory || Lo == ArgClass::X87 || Lo == ArgClass::ComplexX87) - return passOnTheStack(loc, recTy); + return passOnTheStack(loc, recTy, /*isResult=*/false); int neededIntRegisters = 0; int neededSSERegisters = 0; if (Lo == ArgClass::SSE) @@ -609,7 +617,7 @@ struct TargetX86_64 : public GenericTarget { // all in registers or all on the stack). if (!hasEnoughRegisters(loc, neededIntRegisters, neededSSERegisters, previousArguments)) - return passOnTheStack(loc, recTy); + return passOnTheStack(loc, recTy, /*isResult=*/false); if (auto fieldType = passAsFieldIfOneFieldStruct(recTy)) { CodeGenSpecifics::Marshalling marshal; @@ -641,9 +649,57 @@ struct TargetX86_64 : public GenericTarget { return marshal; } + CodeGenSpecifics::Marshalling + structReturnType(mlir::Location loc, fir::RecordType recTy) const override { + std::uint64_t byteOffset = 0; + ArgClass Lo, Hi; + Lo = Hi = ArgClass::NoClass; + byteOffset = classifyStruct(loc, recTy, byteOffset, Lo, Hi); + mlir::MLIRContext *context = recTy.getContext(); + postMerge(byteOffset, Lo, Hi); + if (Lo == ArgClass::Memory) + return passOnTheStack(loc, recTy, /*isResult=*/true); + + // Note that X87/ComplexX87 are passed in memory, but returned via %st0 + // %st1 registers. 
Here, they are returned as fp80 or {fp80, fp80} by + // passAsFieldIfOneFieldStruct, and LLVM will use the expected registers. + + // Note that {_Complex long double} is not 100% clear from an ABI + // perspective because the aggregate post merger rules say it should be + // passed in memory because it is bigger than 2 eight bytes. This has the + // funny effect of + // {_Complex long double} return to be dealt with differently than + // _Complex long double. + + if (auto fieldType = + passAsFieldIfOneFieldStruct(recTy, /*allowComplex=*/true)) { + if (auto complexType = mlir::dyn_cast(fieldType)) + return complexReturnType(loc, complexType.getElementType()); + CodeGenSpecifics::Marshalling marshal; + marshal.emplace_back(fieldType, AT{}); + return marshal; + } + + if (Hi == ArgClass::NoClass || Hi == ArgClass::SSEUp) { + // Return a single integer or floating point argument. + mlir::Type lowType = pickLLVMArgType(loc, context, Lo, byteOffset); + CodeGenSpecifics::Marshalling marshal; + marshal.emplace_back(lowType, AT{}); + return marshal; + } + // Will be returned in two different registers. Generate {lowTy, HiTy} for + // the LLVM IR result type. + CodeGenSpecifics::Marshalling marshal; + mlir::Type lowType = pickLLVMArgType(loc, context, Lo, 8u); + mlir::Type hiType = pickLLVMArgType(loc, context, Hi, byteOffset - 8u); + marshal.emplace_back(mlir::TupleType::get(context, {lowType, hiType}), + AT{}); + return marshal; + } + /// Marshal an argument that must be passed on the stack. 
- CodeGenSpecifics::Marshalling passOnTheStack(mlir::Location loc, - mlir::Type ty) const { + CodeGenSpecifics::Marshalling + passOnTheStack(mlir::Location loc, mlir::Type ty, bool isResult) const { CodeGenSpecifics::Marshalling marshal; auto sizeAndAlign = fir::getTypeSizeAndAlignmentOrCrash(loc, ty, getDataLayout(), kindMap); @@ -651,7 +707,7 @@ struct TargetX86_64 : public GenericTarget { unsigned short align = std::max(sizeAndAlign.second, static_cast(8)); marshal.emplace_back(fir::ReferenceType::get(ty), - AT{align, /*byval=*/true, /*sret=*/false}); + AT{align, /*byval=*/!isResult, /*sret=*/isResult}); return marshal; } }; diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp index fd56fd6bf50f44..04a3ea684642c8 100644 --- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp @@ -142,20 +142,16 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { mlir::ModuleOp getModule() { return getOperation(); } - template + template std::optional> - rewriteCallComplexResultType( - mlir::Location loc, A ty, B &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, C &newOpers, - mlir::Value &savedStackPtr) { - if (noComplexConversion) { - newResTys.push_back(ty); - return std::nullopt; - } - auto m = specifics->complexReturnType(loc, ty.getElementType()); - // Currently targets mandate COMPLEX is a single aggregate or packed - // scalar, including the sret case. - assert(m.size() == 1 && "target of complex return not supported"); + rewriteCallResultType(mlir::Location loc, mlir::Type originalResTy, + Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, + Callback &newOpers, mlir::Value &savedStackPtr, + fir::CodeGenSpecifics::Marshalling &m) { + // Currently, targets mandate COMPLEX or STRUCT is a single aggregate or + // packed scalar, including the sret case. 
+ assert(m.size() == 1 && "return type not supported on this target"); auto resTy = std::get(m[0]); auto attr = std::get(m[0]); if (attr.isSRet()) { @@ -170,7 +166,7 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { newInTyAndAttrs.push_back(m[0]); newOpers.push_back(stack); return [=](mlir::Operation *) -> mlir::Value { - auto memTy = fir::ReferenceType::get(ty); + auto memTy = fir::ReferenceType::get(originalResTy); auto cast = rewriter->create(loc, memTy, stack); return rewriter->create(loc, cast); }; @@ -180,11 +176,41 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { // We are going to generate an alloca, so save the stack pointer. if (!savedStackPtr) savedStackPtr = genStackSave(loc); - return this->convertValueInMemory(loc, call->getResult(0), ty, + return this->convertValueInMemory(loc, call->getResult(0), originalResTy, /*inputMayBeBigger=*/true); }; } + template + std::optional> + rewriteCallComplexResultType( + mlir::Location loc, mlir::ComplexType ty, Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, Callback &newOpers, + mlir::Value &savedStackPtr) { + if (noComplexConversion) { + newResTys.push_back(ty); + return std::nullopt; + } + auto m = specifics->complexReturnType(loc, ty.getElementType()); + return rewriteCallResultType(loc, ty, newResTys, newInTyAndAttrs, newOpers, + savedStackPtr, m); + } + + template + std::optional> + rewriteCallStructResultType( + mlir::Location loc, fir::RecordType recTy, Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, Callback &newOpers, + mlir::Value &savedStackPtr) { + if (noStructConversion) { + newResTys.push_back(recTy); + return std::nullopt; + } + auto m = specifics->structReturnType(loc, recTy); + return rewriteCallResultType(loc, recTy, newResTys, newInTyAndAttrs, + newOpers, savedStackPtr, m); + } + void passArgumentOnStackOrWithNewType( mlir::Location loc, fir::CodeGenSpecifics::TypeAndAttr newTypeAndAttr, mlir::Type oldType, 
mlir::Value oper, @@ -356,6 +382,11 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { newInTyAndAttrs, newOpers, savedStackPtr); }) + .template Case([&](fir::RecordType recTy) { + wrap = rewriteCallStructResultType(loc, recTy, newResTys, + newInTyAndAttrs, newOpers, + savedStackPtr); + }) .Default([&](mlir::Type ty) { newResTys.push_back(ty); }); } else if (fnTy.getResults().size() > 1) { TODO(loc, "multiple results not supported yet"); @@ -562,6 +593,24 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { } } + template + void + lowerStructSignatureRes(mlir::Location loc, fir::RecordType recTy, + Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs) { + if (noComplexConversion) { + newResTys.push_back(recTy); + return; + } else { + for (auto &tup : specifics->structReturnType(loc, recTy)) { + if (std::get(tup).isSRet()) + newInTyAndAttrs.push_back(tup); + else + newResTys.push_back(std::get(tup)); + } + } + } + void lowerStructSignatureArg(mlir::Location loc, fir::RecordType recTy, fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs) { @@ -595,6 +644,9 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { .Case([&](mlir::ComplexType ty) { lowerComplexSignatureRes(loc, ty, newResTys, newInTyAndAttrs); }) + .Case([&](fir::RecordType ty) { + lowerStructSignatureRes(loc, ty, newResTys, newInTyAndAttrs); + }) .Default([&](mlir::Type ty) { newResTys.push_back(ty); }); } llvm::SmallVector trailingInTys; @@ -696,7 +748,8 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { for (auto ty : func.getResults()) if ((mlir::isa(ty) && !noCharacterConversion) || (fir::isa_complex(ty) && !noComplexConversion) || - (mlir::isa(ty) && hasCCallingConv)) { + (mlir::isa(ty) && hasCCallingConv) || + (mlir::isa(ty) && !noStructConversion)) { LLVM_DEBUG(llvm::dbgs() << "rewrite " << signature << " for target\n"); return false; } @@ -770,6 +823,9 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { 
rewriter->getUnitAttr())); newResTys.push_back(retTy); }) + .Case([&](fir::RecordType recTy) { + doStructReturn(func, recTy, newResTys, newInTyAndAttrs, fixups); + }) .Default([&](mlir::Type ty) { newResTys.push_back(ty); }); // Saved potential shift in argument. Handling of result can add arguments @@ -1062,21 +1118,12 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { return false; } - /// Convert a complex return value. This can involve converting the return - /// value to a "hidden" first argument or packing the complex into a wide - /// GPR. template - void doComplexReturn(mlir::func::FuncOp func, mlir::ComplexType cmplx, - Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, - FIXUPS &fixups) { - if (noComplexConversion) { - newResTys.push_back(cmplx); - return; - } - auto m = - specifics->complexReturnType(func.getLoc(), cmplx.getElementType()); - assert(m.size() == 1); + void doReturn(mlir::func::FuncOp func, Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, + FIXUPS &fixups, fir::CodeGenSpecifics::Marshalling &m) { + assert(m.size() == 1 && + "expect result to be turned into single argument or result so far"); auto &tup = m[0]; auto attr = std::get(tup); auto argTy = std::get(tup); @@ -1117,6 +1164,36 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { newResTys.push_back(argTy); } + /// Convert a complex return value. This can involve converting the return + /// value to a "hidden" first argument or packing the complex into a wide + /// GPR. 
+ template + void doComplexReturn(mlir::func::FuncOp func, mlir::ComplexType cmplx, + Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, + FIXUPS &fixups) { + if (noComplexConversion) { + newResTys.push_back(cmplx); + return; + } + auto m = + specifics->complexReturnType(func.getLoc(), cmplx.getElementType()); + doReturn(func, newResTys, newInTyAndAttrs, fixups, m); + } + + template + void doStructReturn(mlir::func::FuncOp func, fir::RecordType recTy, + Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, + FIXUPS &fixups) { + if (noStructConversion) { + newResTys.push_back(recTy); + return; + } + auto m = specifics->structReturnType(func.getLoc(), recTy); + doReturn(func, newResTys, newInTyAndAttrs, fixups, m); + } + template void createFuncOpArgFixups(mlir::func::FuncOp func, diff --git a/flang/lib/Optimizer/Transforms/AbstractResult.cpp b/flang/lib/Optimizer/Transforms/AbstractResult.cpp index 7299ff80121e13..c0ec820d87ed44 100644 --- a/flang/lib/Optimizer/Transforms/AbstractResult.cpp +++ b/flang/lib/Optimizer/Transforms/AbstractResult.cpp @@ -32,6 +32,33 @@ using namespace mlir; namespace fir { namespace { +// Helper to only build the symbol table if needed because its build time is +// linear on the number of symbols in the module. 
+struct LazySymbolTable { + LazySymbolTable(mlir::Operation *op) + : module{op->getParentOfType()} {} + void build() { + if (table) + return; + table = std::make_unique(module); + } + + template + T lookup(llvm::StringRef name) { + build(); + return table->lookup(name); + } + +private: + std::unique_ptr table; + mlir::ModuleOp module; +}; + +bool hasScalarDerivedResult(mlir::FunctionType funTy) { + return funTy.getNumResults() == 1 && + mlir::isa(funTy.getResult(0)); +} + static mlir::Type getResultArgumentType(mlir::Type resultType, bool shouldBoxResult) { return llvm::TypeSwitch(resultType) @@ -190,7 +217,14 @@ class SaveResultOpConversion llvm::LogicalResult matchAndRewrite(fir::SaveResultOp op, mlir::PatternRewriter &rewriter) const override { - rewriter.eraseOp(op); + mlir::Operation *call = op.getValue().getDefiningOp(); + if (mlir::isa(op.getValue().getType()) && call && + fir::hasBindcAttr(call)) { + rewriter.replaceOpWithNewOp(op, op.getValue(), + op.getMemref()); + } else { + rewriter.eraseOp(op); + } return mlir::success(); } }; @@ -300,6 +334,12 @@ class AbstractResultOpt auto *context = &getContext(); // Convert function type itself if it has an abstract result. auto funcTy = mlir::cast(func.getFunctionType()); + // Scalar derived result of BIND(C) function must be returned according + // to the C struct return ABI which is target dependent and implemented in + // the target-rewrite pass. 
+ if (hasScalarDerivedResult(funcTy) && + fir::hasBindcAttr(func.getOperation())) + return; if (hasAbstractResult(funcTy)) { if (fir::isa_builtin_cptr_type(funcTy.getResult(0))) { func.setType(getCPtrFunctionType(funcTy)); @@ -395,6 +435,8 @@ class AbstractResultOpt return; } + LazySymbolTable symbolTable(op); + mlir::RewritePatternSet patterns(context); mlir::ConversionTarget target = *context; const bool shouldBoxResult = this->passResultAsBox.getValue(); @@ -409,14 +451,29 @@ class AbstractResultOpt mlir::func::FuncDialect>(); target.addIllegalOp(); target.addDynamicallyLegalOp([](fir::CallOp call) { - return !hasAbstractResult(call.getFunctionType()); + mlir::FunctionType funTy = call.getFunctionType(); + if (hasScalarDerivedResult(funTy) && + fir::hasBindcAttr(call.getOperation())) + return true; + return !hasAbstractResult(funTy); }); - target.addDynamicallyLegalOp([](fir::AddrOfOp addrOf) { - if (auto funTy = mlir::dyn_cast(addrOf.getType())) + target.addDynamicallyLegalOp([&symbolTable]( + fir::AddrOfOp addrOf) { + if (auto funTy = mlir::dyn_cast(addrOf.getType())) { + if (hasScalarDerivedResult(funTy)) { + auto func = symbolTable.lookup( + addrOf.getSymbol().getRootReference().getValue()); + return func && fir::hasBindcAttr(func.getOperation()); + } return !hasAbstractResult(funTy); + } return true; }); target.addDynamicallyLegalOp([](fir::DispatchOp dispatch) { + mlir::FunctionType funTy = dispatch.getFunctionType(); + if (hasScalarDerivedResult(funTy) && + fir::hasBindcAttr(dispatch.getOperation())) + return true; return !hasAbstractResult(dispatch.getFunctionType()); }); diff --git a/flang/test/Fir/abstract-results-bindc.fir b/flang/test/Fir/abstract-results-bindc.fir new file mode 100644 index 00000000000000..9b26730f7d2923 --- /dev/null +++ b/flang/test/Fir/abstract-results-bindc.fir @@ -0,0 +1,43 @@ +// Test that bind_c derived type results are not moved to a hidden argument +// by the abstract-result pass. 
They will be dealt with according to the C +// struct returning ABI for the target in the target-rewrite pass. +// RUN: fir-opt %s --abstract-result | FileCheck %s + +!t = !fir.type + +func.func private @foo() -> !t attributes {fir.proc_attrs = #fir.proc_attrs} + +func.func @test_call(%x: !fir.ref) { + %0 = fir.call @foo() proc_attrs : () -> !t + fir.save_result %0 to %x : !t, !fir.ref + return +} + +func.func @test_addr_of() -> (() -> !t) { + %0 = fir.address_of(@foo) : () -> !t + return %0 : () -> !t +} + +func.func @test_dispatch(%x: !fir.ref, %y : !fir.class>) { + %0 = fir.dispatch "bar"(%y : !fir.class>) (%y : !fir.class>) -> !t proc_attrs {pass_arg_pos = 0 : i32} + fir.save_result %0 to %x : !t, !fir.ref + return +} + +// CHECK-LABEL: func.func @test_call( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>) { +// CHECK: %[[VAL_1:.*]] = fir.call @foo() proc_attrs : () -> !fir.type +// CHECK: fir.store %[[VAL_1]] to %[[VAL_0]] : !fir.ref> +// CHECK: return +// CHECK: } +// CHECK-LABEL: func.func @test_addr_of() -> (() -> !fir.type) { +// CHECK: %[[VAL_0:.*]] = fir.address_of(@foo) : () -> !fir.type +// CHECK: return %[[VAL_0]] : () -> !fir.type +// CHECK: } +// CHECK-LABEL: func.func @test_dispatch( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.class>) { +// CHECK: %[[VAL_2:.*]] = fir.dispatch "bar"(%[[VAL_1]] : !fir.class>) (%[[VAL_1]] : !fir.class>) -> !fir.type proc_attrs {pass_arg_pos = 0 : i32} +// CHECK: fir.store %[[VAL_2]] to %[[VAL_0]] : !fir.ref> +// CHECK: return +// CHECK: } diff --git a/flang/test/Fir/struct-return-x86-64.fir b/flang/test/Fir/struct-return-x86-64.fir new file mode 100644 index 00000000000000..f4c2add69ff7e9 --- /dev/null +++ b/flang/test/Fir/struct-return-x86-64.fir @@ -0,0 +1,120 @@ +// Test X86-64 ABI rewrite of struct returned by value (BIND(C), VALUE derived types). 
+// REQUIRES: x86-registered-target +// RUN: fir-opt --target-rewrite %s | FileCheck %s + +!fits_in_reg = !fir.type +!too_big = !fir.type}> + +module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + + func.func private @test_inreg() -> !fits_in_reg + func.func @test_call_inreg(%arg0: !fir.ref) { + %0 = fir.call @test_inreg() : () -> !fits_in_reg + fir.store %0 to %arg0 : !fir.ref + return + } + func.func @test_addr_of_inreg() -> (() -> ()) { + %0 = fir.address_of(@test_inreg) : () -> !fits_in_reg + %1 = fir.convert %0 : (() -> !fits_in_reg) -> (() -> ()) + return %1 : () -> () + } + func.func @test_dispatch_inreg(%arg0: !fir.ref, %arg1: !fir.class>) { + %0 = fir.dispatch "bar"(%arg1 : !fir.class>) (%arg1 : !fir.class>) -> !fits_in_reg {pass_arg_pos = 0 : i32} + fir.store %0 to %arg0 : !fir.ref + return + } + + func.func private @test_sret() -> !too_big + func.func @test_call_sret(%arg0: !fir.ref) { + %0 = fir.call @test_sret() : () -> !too_big + fir.store %0 to %arg0 : !fir.ref + return + } + func.func @test_addr_of_sret() -> (() -> ()) { + %0 = fir.address_of(@test_sret) : () -> !too_big + %1 = fir.convert %0 : (() -> !too_big) -> (() -> ()) + return %1 : () -> () + } + func.func @test_dispatch_sret(%arg0: !fir.ref, %arg1: !fir.class>) { + %0 = fir.dispatch "bar"(%arg1 : !fir.class>) (%arg1 : !fir.class>) -> !too_big {pass_arg_pos = 0 : i32} + fir.store %0 to %arg0 : !fir.ref + return + } + func.func private @test_fp_80() -> !fir.type + func.func private @test_complex_80() -> !fir.type}> + func.func private @test_two_fp_80() -> !fir.type + func.func private @test_fp128() -> !fir.type +} + +// CHECK-LABEL: func.func private @test_inreg() -> tuple + +// CHECK-LABEL: func.func @test_call_inreg( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>) { +// CHECK: %[[VAL_1:.*]] = fir.call @test_inreg() : () -> tuple 
+// CHECK: %[[VAL_2:.*]] = llvm.intr.stacksave : !llvm.ptr +// CHECK: %[[VAL_3:.*]] = fir.alloca tuple +// CHECK: fir.store %[[VAL_1]] to %[[VAL_3]] : !fir.ref> +// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>) -> !fir.ref> +// CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]] : !fir.ref> +// CHECK: llvm.intr.stackrestore %[[VAL_2]] : !llvm.ptr +// CHECK: fir.store %[[VAL_5]] to %[[VAL_0]] : !fir.ref> +// CHECK: return +// CHECK: } + +// CHECK-LABEL: func.func @test_addr_of_inreg() -> (() -> ()) { +// CHECK: %[[VAL_0:.*]] = fir.address_of(@test_inreg) : () -> tuple +// CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (() -> tuple) -> (() -> ()) +// CHECK: return %[[VAL_1]] : () -> () +// CHECK: } + +// CHECK-LABEL: func.func @test_dispatch_inreg( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.class>) { +// CHECK: %[[VAL_2:.*]] = fir.dispatch "bar"(%[[VAL_1]] : !fir.class>) (%[[VAL_1]] : !fir.class>) -> tuple {pass_arg_pos = 0 : i32} +// CHECK: %[[VAL_3:.*]] = llvm.intr.stacksave : !llvm.ptr +// CHECK: %[[VAL_4:.*]] = fir.alloca tuple +// CHECK: fir.store %[[VAL_2]] to %[[VAL_4]] : !fir.ref> +// CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>) -> !fir.ref> +// CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref> +// CHECK: llvm.intr.stackrestore %[[VAL_3]] : !llvm.ptr +// CHECK: fir.store %[[VAL_6]] to %[[VAL_0]] : !fir.ref> +// CHECK: return +// CHECK: } +// CHECK: func.func private @test_sret(!fir.ref}>> {llvm.align = 8 : i32, llvm.sret = !fir.type}>}) + +// CHECK-LABEL: func.func @test_call_sret( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref}>>) { +// CHECK: %[[VAL_1:.*]] = llvm.intr.stacksave : !llvm.ptr +// CHECK: %[[VAL_2:.*]] = fir.alloca !fir.type}> +// CHECK: fir.call @test_sret(%[[VAL_2]]) : (!fir.ref}>>) -> () +// CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref}>>) -> !fir.ref}>> +// CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.ref}>> +// CHECK: llvm.intr.stackrestore %[[VAL_1]] : !llvm.ptr +// 
CHECK: fir.store %[[VAL_4]] to %[[VAL_0]] : !fir.ref}>> +// CHECK: return +// CHECK: } + +// CHECK-LABEL: func.func @test_addr_of_sret() -> (() -> ()) { +// CHECK: %[[VAL_0:.*]] = fir.address_of(@test_sret) : (!fir.ref}>>) -> () +// CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : ((!fir.ref}>>) -> ()) -> (() -> ()) +// CHECK: return %[[VAL_1]] : () -> () +// CHECK: } + +// CHECK-LABEL: func.func @test_dispatch_sret( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref}>>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.class>) { +// CHECK: %[[VAL_2:.*]] = llvm.intr.stacksave : !llvm.ptr +// CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type}> +// CHECK: fir.dispatch "bar"(%[[VAL_1]] : !fir.class>) (%[[VAL_3]], %[[VAL_1]] : !fir.ref}>>, !fir.class>) {pass_arg_pos = 1 : i32} +// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref}>>) -> !fir.ref}>> +// CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]] : !fir.ref}>> +// CHECK: llvm.intr.stackrestore %[[VAL_2]] : !llvm.ptr +// CHECK: fir.store %[[VAL_5]] to %[[VAL_0]] : !fir.ref}>> +// CHECK: return +// CHECK: } + + +// CHECK: func.func private @test_fp_80() -> f80 +// CHECK: func.func private @test_complex_80(!fir.ref}>> {llvm.align = 16 : i32, llvm.sret = !fir.type}>}) +// CHECK: func.func private @test_two_fp_80(!fir.ref> {llvm.align = 16 : i32, llvm.sret = !fir.type}) +// CHECK: func.func private @test_fp128() -> f128 From 6779376ee917279b16e256839d236cfdf8fd9ab9 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 10 Oct 2024 14:38:07 +0100 Subject: [PATCH 014/177] [Dexter] Remove outdated imp dependency (#111833) Fixes: https://github.com/llvm/llvm-project/issues/111815 This patch replaces usage of the python `imp` library, which is deprecated since python3.4 and removed in python3.12, with the `importlib` library. As part of this update the repeated find_module+load_module pattern is moved into a utility function, since the importlib equivalent is much more verbose. 
--- .../dexter/dex/debugger/lldb/LLDB.py | 5 ++--- .../dex/debugger/visualstudio/VisualStudio.py | 8 +++----- .../debuginfo-tests/dexter/dex/tools/Main.py | 6 ++---- .../debuginfo-tests/dexter/dex/tools/help/Tool.py | 8 +++----- .../debuginfo-tests/dexter/dex/utils/Imports.py | 13 +++++++++++++ 5 files changed, 23 insertions(+), 17 deletions(-) create mode 100644 cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py index 2307550aca047b..e8bc65cd3fbe88 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py @@ -7,7 +7,6 @@ """Interface for communicating with the LLDB debugger via its python interface. """ -import imp import os import shlex from subprocess import CalledProcessError, check_output, STDOUT @@ -18,6 +17,7 @@ from dex.dextIR import StackFrame, SourceLocation, ProgramState from dex.utils.Exceptions import DebuggerException, LoadDebuggerException from dex.utils.ReturnCode import ReturnCode +from dex.utils.Imports import load_module class LLDB(DebuggerBase): @@ -82,8 +82,7 @@ def _load_interface(self): ) try: - module_info = imp.find_module("lldb", [pythonpath]) - return imp.load_module("lldb", *module_info) + return load_module("lldb", pythonpath) except ImportError as e: msg = str(e) if msg.endswith("not a valid Win32 application."): diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py index 17587b3f3e18d6..7cb56ec0c25a76 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py @@ -7,7 +7,6 @@ """Interface for communicating with the 
Visual Studio debugger via DTE.""" import abc -import imp import os import sys from enum import IntEnum @@ -19,15 +18,14 @@ from dex.dextIR import FrameIR, LocIR, StepIR, StopReason, ValueIR from dex.dextIR import StackFrame, SourceLocation, ProgramState from dex.utils.Exceptions import Error, LoadDebuggerException +from dex.utils.Imports import load_module from dex.utils.ReturnCode import ReturnCode - def _load_com_module(): try: - module_info = imp.find_module( - "ComInterface", [os.path.join(os.path.dirname(__file__), "windows")] + return load_module( + "ComInterface", os.path.join(os.path.dirname(__file__), "windows") ) - return imp.load_module("ComInterface", *module_info) except ImportError as e: raise LoadDebuggerException(e, sys.exc_info()) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/Main.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/Main.py index b6c146ad784062..512958d20f4bbc 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/Main.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/Main.py @@ -10,7 +10,6 @@ subtool. 
""" -import imp import os import sys @@ -18,6 +17,7 @@ from dex.utils import ExtArgParse as argparse from dex.utils import get_root_directory from dex.utils.Exceptions import Error, ToolArgumentError +from dex.utils.Imports import load_module from dex.utils.Logging import Logger from dex.utils.UnitTests import unit_tests_ok from dex.utils.Version import version @@ -135,9 +135,7 @@ def _import_tool_module(tool_name): tool_name = tool_name.replace("-", "_") tools_directory = get_tools_directory() - module_info = imp.find_module(tool_name, [tools_directory]) - - return imp.load_module(tool_name, *module_info) + return load_module(tool_name, tools_directory) def tool_main(context, tool, args): diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/help/Tool.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/help/Tool.py index 520bf9f59917af..44e0a0e65c4bac 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/help/Tool.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/help/Tool.py @@ -6,10 +6,10 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """Help tool.""" -import imp import textwrap from dex.tools import ToolBase, get_tool_names, get_tools_directory, tool_main +from dex.utils.Imports import load_module from dex.utils.ReturnCode import ReturnCode @@ -39,8 +39,7 @@ def _default_text(self): tools_directory = get_tools_directory() for tool_name in sorted(self._visible_tool_names): internal_name = tool_name.replace("-", "_") - module_info = imp.find_module(internal_name, [tools_directory]) - tool_doc = imp.load_module(internal_name, *module_info).Tool.__doc__ + tool_doc = load_module(internal_name, tools_directory).Tool.__doc__ tool_doc = tool_doc.strip() if tool_doc else "" tool_doc = textwrap.fill(" ".join(tool_doc.split()), 80) s += "{}\n{}\n\n".format(tool_name, tool_doc) @@ -53,6 +52,5 @@ def go(self) -> ReturnCode: tool_name = self.context.options.tool.replace("-", "_") tools_directory = get_tools_directory() 
- module_info = imp.find_module(tool_name, [tools_directory]) - module = imp.load_module(tool_name, *module_info) + module = load_module(tool_name, tools_directory) return tool_main(self.context, module.Tool(self.context), ["--help"]) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py b/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py new file mode 100644 index 00000000000000..ea052c21a18498 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py @@ -0,0 +1,13 @@ +import importlib +import os +import sys + + +def load_module(name, path): + spec = importlib.util.spec_from_file_location( + name, os.path.join(path, name, "__init__.py") + ) + module = importlib.util.module_from_spec(spec) + sys.modules[name] = module + spec.loader.exec_module(module) + return module From a3cd269fbebecb6971e216a9c29ad8933ad7b0fc Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 10 Oct 2024 21:40:24 +0800 Subject: [PATCH 015/177] [RISCV] Remove {s,u}int_to_fp custom op action for f16/bf16 (#111471) It turns out that {s,u}int_to_fp nodes get their operation action from their operand's type, not the result type, so we don't need to set it for fp16 or bf16. vp_{s,u}int_to_fp uses the result type though so we need to keep it. This also means that we can lower int_to_fp for fixed length bf16 vectors already, so this adds tests for that. The cost model test changes are due to BasicTTIImpl's getCastInstrCost not taking into account that int_to_fp needs its legal type swapped. 
This can be fixed in a later patch, but its worth noting that the affected types in the tests currently crash when lowered anyway (due to them needing split at LMUL > 8) --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 9 +- .../Analysis/CostModel/RISCV/cast-half.ll | 8 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll | 66 +++++++++- .../CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll | 124 +++++++++++++++++- 4 files changed, 189 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 01fa418e4dbdf4..230ccd8209e1f2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1071,9 +1071,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::VP_SINT_TO_FP, - ISD::VP_UINT_TO_FP}, - VT, Custom); + setOperationAction({ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, VT, Custom); setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_DEINTERLEAVE, ISD::VECTOR_INTERLEAVE, @@ -1343,9 +1341,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction( {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT, Custom); - setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, - ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, - VT, Custom); + setOperationAction({ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, VT, + Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); if (Subtarget.hasStdExtZfhmin()) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); diff --git a/llvm/test/Analysis/CostModel/RISCV/cast-half.ll b/llvm/test/Analysis/CostModel/RISCV/cast-half.ll index 84b5486eb2de1c..244c42cc94ba03 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast-half.ll +++ 
b/llvm/test/Analysis/CostModel/RISCV/cast-half.ll @@ -842,7 +842,7 @@ define void @sitofp() { ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half> -; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half> @@ -988,7 +988,7 @@ define void @sitofp() { ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64f16 = sitofp <64 x i64> undef to <64 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v64i1_v64f16 = sitofp <64 x i1> undef to <64 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v128i8_v128f16 = sitofp <128 x i8> undef to <128 x half> -; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v128i16_v128f16 = sitofp <128 x i16> undef to <128 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 50 for instruction: 
%v128i32_v128f16 = sitofp <128 x i32> undef to <128 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128f16 = sitofp <128 x i64> undef to <128 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v128i1_v128f16 = sitofp <128 x i1> undef to <128 x half> @@ -1208,7 +1208,7 @@ define void @uitofp() { ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> -; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> +; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> ; RV32ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> @@ -1354,7 +1354,7 @@ define void @uitofp() { ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64f16 = uitofp <64 x i64> undef to <64 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %v64i1_v64f16 = uitofp <64 x i1> undef to <64 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v128i8_v128f16 = uitofp <128 x i8> undef to <128 x half> -; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> +; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v128i16_v128f16 = uitofp <128 x i16> undef to <128 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v128i32_v128f16 = uitofp <128 x i32> undef to <128 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128f16 = uitofp <128 x i64> undef to <128 x half> ; RV64ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %v128i1_v128f16 = uitofp <128 x i1> undef to <128 x half> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll index bfcc7017178e31..a4a491989c7f02 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH64 -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d 
-mattr=+v,+zfh,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64 define void @fp2si_v2f32_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: fp2si_v2f32_v2i32: @@ -432,6 +432,64 @@ define void @fp2ui_v8f32_v8i64(ptr %x, ptr %y) { ret void } +define void @fp2si_v2bf16_v2i64(ptr %x, ptr %y) { +; CHECK-LABEL: fp2si_v2bf16_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.rtz.x.f.v v8, v9 +; CHECK-NEXT: vse64.v v8, (a1) +; CHECK-NEXT: ret + %a = load <2 x bfloat>, ptr %x + %d = fptosi <2 x bfloat> %a to <2 x i64> + store <2 x i64> %d, ptr %y + ret void +} + +define void @fp2ui_v2bf16_v2i64(ptr %x, ptr %y) { +; CHECK-LABEL: fp2ui_v2bf16_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.rtz.xu.f.v v8, v9 +; CHECK-NEXT: vse64.v v8, (a1) +; CHECK-NEXT: ret + %a = load <2 x bfloat>, ptr %x + %d = fptoui <2 x bfloat> %a to <2 x i64> + store <2 x i64> %d, ptr %y + ret void +} + +define <2 x i1> @fp2si_v2bf16_v2i1(<2 x bfloat> %x) { +; CHECK-LABEL: fp2si_v2bf16_v2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: ret + %z = fptosi <2 x bfloat> %x to <2 x i1> + ret <2 x i1> %z +} + +define <2 x i1> @fp2ui_v2bf16_v2i1(<2 x bfloat> %x) { +; CHECK-LABEL: fp2ui_v2bf16_v2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: 
vfwcvtbf16.f.f.v v9, v8 +; CHECK-NEXT: vfncvt.rtz.xu.f.w v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: ret + %z = fptoui <2 x bfloat> %x to <2 x i1> + ret <2 x i1> %z +} + define void @fp2si_v2f16_v2i64(ptr %x, ptr %y) { ; CHECK-LABEL: fp2si_v2f16_v2i64: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll index 7333067e9205e0..9cdc9b81c9530a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH64 -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVFH64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN32 +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfhmin,+zvfbfmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,ZVFHMIN64 define void 
@si2fp_v2i32_v2f32(ptr %x, ptr %y) { ; CHECK-LABEL: si2fp_v2i32_v2f32: @@ -418,6 +418,122 @@ define <8 x double> @ui2fp_v8i1_v8f64(<8 x i1> %x) { ret <8 x double> %z } +define void @si2fp_v2i64_v2bf16(ptr %x, ptr %y) { +; CHECK-LABEL: si2fp_v2i64_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfncvt.f.x.w v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: ret + %a = load <2 x i64>, ptr %x + %d = sitofp <2 x i64> %a to <2 x bfloat> + store <2 x bfloat> %d, ptr %y + ret void +} + +define void @ui2fp_v2i64_v2bf16(ptr %x, ptr %y) { +; CHECK-LABEL: ui2fp_v2i64_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfncvt.f.xu.w v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: ret + %a = load <2 x i64>, ptr %x + %d = uitofp <2 x i64> %a to <2 x bfloat> + store <2 x bfloat> %d, ptr %y + ret void +} + +define <2 x bfloat> @si2fp_v2i1_v2bf16(<2 x i1> %x) { +; CHECK-LABEL: si2fp_v2i1_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 +; CHECK-NEXT: vfwcvt.f.x.v v9, v8 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %z = sitofp <2 x i1> %x to <2 x bfloat> + ret <2 x bfloat> %z +} + +define <2 x bfloat> @ui2fp_v2i1_v2bf16(<2 x i1> %x) { +; CHECK-LABEL: ui2fp_v2i1_v2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v9 +; CHECK-NEXT: ret + %z = uitofp <2 x i1> %x to <2 x bfloat> + ret <2 x bfloat> %z +} + +define void @si2fp_v8i64_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: 
si2fp_v8i64_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfncvt.f.x.w v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: ret + %a = load <8 x i64>, ptr %x + %d = sitofp <8 x i64> %a to <8 x bfloat> + store <8 x bfloat> %d, ptr %y + ret void +} + +define void @ui2fp_v8i64_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: ui2fp_v8i64_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfncvt.f.xu.w v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 +; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: ret + %a = load <8 x i64>, ptr %x + %d = uitofp <8 x i64> %a to <8 x bfloat> + store <8 x bfloat> %d, ptr %y + ret void +} + +define <8 x bfloat> @si2fp_v8i1_v8bf16(<8 x i1> %x) { +; CHECK-LABEL: si2fp_v8i1_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 +; CHECK-NEXT: vfwcvt.f.x.v v10, v8 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %z = sitofp <8 x i1> %x to <8 x bfloat> + ret <8 x bfloat> %z +} + +define <8 x bfloat> @ui2fp_v8i1_v8bf16(<8 x i1> %x) { +; CHECK-LABEL: ui2fp_v8i1_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vfwcvt.f.xu.v v10, v8 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: ret + %z = uitofp <8 x i1> %x to <8 x bfloat> + ret <8 x bfloat> %z +} + define void @si2fp_v2i64_v2f16(ptr %x, ptr %y) { ; CHECK-LABEL: si2fp_v2i64_v2f16: ; CHECK: # %bb.0: From 0a0f100a70583725428ec317138b09f935a2b9bb Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Thu, 10 Oct 2024 15:03:01 +0100 Subject: [PATCH 016/177] Revert "[LTO] Run Argument Promotion before IPSCCP" 
(#111839) Reverts llvm/llvm-project#111163, as this was merged prematurely. --- llvm/lib/Passes/PassBuilderPipelines.cpp | 9 --------- llvm/test/Other/new-pm-lto-defaults.ll | 9 +++------ 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index cdb9431c755bce..8f151a99b11709 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1831,15 +1831,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MPM.addPass(PGOIndirectCallPromotion( true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)); - // Promoting by-reference arguments to by-value exposes more constants to - // IPSCCP. - MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( - PostOrderFunctionAttrsPass(/*SkipNonRecursive*/ true))); - MPM.addPass( - createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass())); - MPM.addPass( - createModuleToFunctionPassAdaptor(SROAPass(SROAOptions::ModifyCFG))); - // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function // pointers passed as arguments to direct uses of functions. 
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll index 2dd754ecef4d7b..5543472df685b0 100644 --- a/llvm/test/Other/new-pm-lto-defaults.ll +++ b/llvm/test/Other/new-pm-lto-defaults.ll @@ -41,17 +41,14 @@ ; CHECK-O23SZ-NEXT: PGOIndirectCallPromotion ; CHECK-O23SZ-NEXT: Running analysis: ProfileSummaryAnalysis ; CHECK-O23SZ-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis +; CHECK-O23SZ-NEXT: Running pass: IPSCCPPass +; CHECK-O23SZ-NEXT: Running analysis: AssumptionAnalysis on foo +; CHECK-O23SZ-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}SCC ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis ; CHECK-O1-NEXT: Running analysis: TargetLibraryAnalysis ; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}> -; CHECK-O23SZ-NEXT: Running pass: PostOrderFunctionAttrsPass -; CHECK-O23SZ-NEXT: Running pass: ArgumentPromotionPass -; CHECK-O23SZ-NEXT: Running pass: SROAPass -; CHECK-O23SZ-NEXT: Running analysis: AssumptionAnalysis on foo -; CHECK-O23SZ-NEXT: Running pass: IPSCCPPass -; CHECK-O23SZ-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass ; CHECK-O-NEXT: Running analysis: AAManager ; CHECK-O-NEXT: Running analysis: BasicAA From dabb0ddbd7a7229855156c61df1d35ad845361ac Mon Sep 17 00:00:00 2001 From: Vladimir Radosavljevic <129192835+vladimirradosavljevic@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:05:42 +0200 Subject: [PATCH 017/177] [MCP] Skip invalidating def constant regs during forward propagation (#111129) Before this patch, redundant COPY couldn't be removed for the following case: ``` %reg1 = COPY %const-reg ... // There is a def of %const-reg %reg2 = COPY killed %reg1 ``` where this can be optimized to: ``` ... 
// There is a def of %const-reg %reg2 = COPY %const-reg ``` This patch allows for such optimization by not invalidating defined constant registers. This is safe, as architectures like AArch64 and RISCV replace a dead definition of a GPR with a zero constant register for certain instructions. --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 7 +++++-- .../AArch64/machine-cp-constant-reg.mir | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/machine-cp-constant-reg.mir diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 8bcc437cbfb865..fb4da2c11cda77 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -886,8 +886,11 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { "MachineCopyPropagation should be run after register allocation!"); if (MO.isDef() && !MO.isEarlyClobber()) { - Defs.push_back(Reg.asMCReg()); - continue; + // Skip invalidating constant registers. + if (!MRI->isConstantPhysReg(Reg)) { + Defs.push_back(Reg.asMCReg()); + continue; + } } else if (MO.readsReg()) ReadRegister(Reg.asMCReg(), MI, MO.isDebug() ? 
DebugUse : RegularUse); } diff --git a/llvm/test/CodeGen/AArch64/machine-cp-constant-reg.mir b/llvm/test/CodeGen/AArch64/machine-cp-constant-reg.mir new file mode 100644 index 00000000000000..cad55b9daffafd --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-cp-constant-reg.mir @@ -0,0 +1,19 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass machine-cp -verify-machineinstrs -o - %s | FileCheck %s + +--- +name: test +body: | + bb.0: + liveins: $w2 + ; CHECK-LABEL: name: test + ; CHECK: liveins: $w2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $wzr = SUBSWri killed renamable $w2, 0, 0, implicit-def $nzcv + ; CHECK-NEXT: renamable $w0 = COPY $wzr + ; CHECK-NEXT: RET_ReallyLR implicit killed $w0 + renamable $w1 = COPY $wzr + $wzr = SUBSWri killed renamable $w2, 0, 0, implicit-def $nzcv + renamable $w0 = COPY killed renamable $w1 + RET_ReallyLR implicit killed $w0 +... From 3737a5321901574b3f4b2cf0d798faea5c4a2302 Mon Sep 17 00:00:00 2001 From: Tyler Nowicki Date: Thu, 10 Oct 2024 10:08:45 -0400 Subject: [PATCH 018/177] [Coroutines] Support for Custom ABIs (#111755) This change extends the current method for creating ABI object to allow users (plugin libraries) to create custom ABI objects for their needs. This is accomplished by inheriting one of the common ABIs and overriding one or more of the methods to create a custom ABI. To use a custom ABI for a given coroutine the coro.begin.custom.abi intrinsic is used in place of the coro.begin intrinsic. This takes an additional i32 arg that specifies the index of an ABI generator for the custom ABI object in a SmallVector passed to the CoroSplitPass ctor. The detailed changes include: * Add the llvm.coro.begin.custom intrinsic used to specify the index of the custom ABI to use for the given coroutine. * Add constructors to CoroSplit that take a list of generators that create the custom ABI object. 
* Extend the CreateNewABI function used by CoroSplit to return a unique_ptr to an ABI object. * Add has/getCustomABI methods to CoroBeginInst class. * Add a unittest for a custom ABI. See doc update here: https://github.com/llvm/llvm-project/pull/111781 --- .../llvm/Analysis/TargetTransformInfoImpl.h | 1 + llvm/include/llvm/IR/Intrinsics.td | 3 +- llvm/include/llvm/Transforms/Coroutines/ABI.h | 8 +- .../llvm/Transforms/Coroutines/CoroInstr.h | 19 +++- .../llvm/Transforms/Coroutines/CoroSplit.h | 13 ++- .../lib/Transforms/Coroutines/CoroCleanup.cpp | 4 +- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 38 +++++++- llvm/lib/Transforms/Coroutines/Coroutines.cpp | 4 +- .../Transforms/Coroutines/ExtraRematTest.cpp | 87 +++++++++++++++++++ 9 files changed, 164 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 6d3ce93acbe451..3d0140ad7ad7a3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -778,6 +778,7 @@ class TargetTransformInfoImplBase { case Intrinsic::experimental_gc_relocate: case Intrinsic::coro_alloc: case Intrinsic::coro_begin: + case Intrinsic::coro_begin_custom_abi: case Intrinsic::coro_free: case Intrinsic::coro_end: case Intrinsic::coro_frame: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 20dd921ddbd230..8a0721cf23f538 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1719,7 +1719,8 @@ def int_coro_prepare_async : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], [IntrNoMem]>; def int_coro_begin : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty], [WriteOnly>]>; - +def int_coro_begin_custom_abi : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty, llvm_i32_ty], + [WriteOnly>]>; def int_coro_free : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly, ReadOnly>, 
diff --git a/llvm/include/llvm/Transforms/Coroutines/ABI.h b/llvm/include/llvm/Transforms/Coroutines/ABI.h index e7568d275c1615..8b83c5308056eb 100644 --- a/llvm/include/llvm/Transforms/Coroutines/ABI.h +++ b/llvm/include/llvm/Transforms/Coroutines/ABI.h @@ -29,7 +29,13 @@ namespace coro { // This interface/API is to provide an object oriented way to implement ABI // functionality. This is intended to replace use of the ABI enum to perform // ABI operations. The ABIs (e.g. Switch, Async, Retcon{Once}) are the common -// ABIs. +// ABIs. However, specific users may need to modify the behavior of these. This +// can be accomplished by inheriting one of the common ABIs and overriding one +// or more of the methods to create a custom ABI. To use a custom ABI for a +// given coroutine the coro.begin.custom.abi intrinsic is used in place of the +// coro.begin intrinsic. This takes an additional i32 arg that specifies the +// index of an ABI generator for the custom ABI object in a SmallVector passed +// to CoroSplitPass ctor. class BaseABI { public: diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h b/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h index a329a06bf13891..3aa30bec85c3a5 100644 --- a/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h +++ b/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h @@ -124,7 +124,8 @@ class AnyCoroIdInst : public IntrinsicInst { IntrinsicInst *getCoroBegin() { for (User *U : users()) if (auto *II = dyn_cast(U)) - if (II->getIntrinsicID() == Intrinsic::coro_begin) + if (II->getIntrinsicID() == Intrinsic::coro_begin || + II->getIntrinsicID() == Intrinsic::coro_begin_custom_abi) return II; llvm_unreachable("no coro.begin associated with coro.id"); } @@ -442,20 +443,30 @@ class CoroFreeInst : public IntrinsicInst { } }; -/// This class represents the llvm.coro.begin instructions. +/// This class represents the llvm.coro.begin or llvm.coro.begin.custom.abi +/// instructions. 
class CoroBeginInst : public IntrinsicInst { - enum { IdArg, MemArg }; + enum { IdArg, MemArg, CustomABIArg }; public: AnyCoroIdInst *getId() const { return cast(getArgOperand(IdArg)); } + bool hasCustomABI() const { + return getIntrinsicID() == Intrinsic::coro_begin_custom_abi; + } + + int getCustomABI() const { + return cast(getArgOperand(CustomABIArg))->getZExtValue(); + } + Value *getMem() const { return getArgOperand(MemArg); } // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { - return I->getIntrinsicID() == Intrinsic::coro_begin; + return I->getIntrinsicID() == Intrinsic::coro_begin || + I->getIntrinsicID() == Intrinsic::coro_begin_custom_abi; } static bool classof(const Value *V) { return isa(V) && classof(cast(V)); diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroSplit.h b/llvm/include/llvm/Transforms/Coroutines/CoroSplit.h index a5fd57f8f9dfab..6c6a982e828050 100644 --- a/llvm/include/llvm/Transforms/Coroutines/CoroSplit.h +++ b/llvm/include/llvm/Transforms/Coroutines/CoroSplit.h @@ -28,17 +28,26 @@ struct Shape; } // namespace coro struct CoroSplitPass : PassInfoMixin { + using BaseABITy = + std::function(Function &, coro::Shape &)>; CoroSplitPass(bool OptimizeFrame = false); + + CoroSplitPass(SmallVector GenCustomABIs, + bool OptimizeFrame = false); + CoroSplitPass(std::function MaterializableCallback, bool OptimizeFrame = false); + CoroSplitPass(std::function MaterializableCallback, + SmallVector GenCustomABIs, + bool OptimizeFrame = false); + PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR); + static bool isRequired() { return true; } - using BaseABITy = - std::function(Function &, coro::Shape &)>; // Generator for an ABI transformer BaseABITy CreateAndInitABI; diff --git a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp index dd92b3593af92e..1cda7f93f72a2c 100644 --- 
a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -53,6 +53,7 @@ bool Lowerer::lower(Function &F) { default: continue; case Intrinsic::coro_begin: + case Intrinsic::coro_begin_custom_abi: II->replaceAllUsesWith(II->getArgOperand(1)); break; case Intrinsic::coro_free: @@ -112,7 +113,8 @@ static bool declaresCoroCleanupIntrinsics(const Module &M) { M, {"llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.subfn.addr", "llvm.coro.free", "llvm.coro.id", "llvm.coro.id.retcon", "llvm.coro.id.async", "llvm.coro.id.retcon.once", - "llvm.coro.async.size.replace", "llvm.coro.async.resume"}); + "llvm.coro.async.size.replace", "llvm.coro.async.resume", + "llvm.coro.begin.custom.abi"}); } PreservedAnalyses CoroCleanupPass::run(Module &M, diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index ef1f27118bc14b..88ce331c8cfb64 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -2200,7 +2200,15 @@ static void addPrepareFunction(const Module &M, static std::unique_ptr CreateNewABI(Function &F, coro::Shape &S, - std::function IsMatCallback) { + std::function IsMatCallback, + const SmallVector GenCustomABIs) { + if (S.CoroBegin->hasCustomABI()) { + unsigned CustomABI = S.CoroBegin->getCustomABI(); + if (CustomABI >= GenCustomABIs.size()) + llvm_unreachable("Custom ABI not found amoung those specified"); + return GenCustomABIs[CustomABI](F, S); + } + switch (S.ABI) { case coro::ABI::Switch: return std::unique_ptr( @@ -2221,7 +2229,17 @@ CreateNewABI(Function &F, coro::Shape &S, CoroSplitPass::CoroSplitPass(bool OptimizeFrame) : CreateAndInitABI([](Function &F, coro::Shape &S) { std::unique_ptr ABI = - CreateNewABI(F, S, coro::isTriviallyMaterializable); + CreateNewABI(F, S, coro::isTriviallyMaterializable, {}); + ABI->init(); + return ABI; + }), + OptimizeFrame(OptimizeFrame) {} + +CoroSplitPass::CoroSplitPass( + 
SmallVector GenCustomABIs, bool OptimizeFrame) + : CreateAndInitABI([=](Function &F, coro::Shape &S) { + std::unique_ptr ABI = + CreateNewABI(F, S, coro::isTriviallyMaterializable, GenCustomABIs); ABI->init(); return ABI; }), @@ -2232,7 +2250,21 @@ CoroSplitPass::CoroSplitPass(bool OptimizeFrame) CoroSplitPass::CoroSplitPass(std::function IsMatCallback, bool OptimizeFrame) : CreateAndInitABI([=](Function &F, coro::Shape &S) { - std::unique_ptr ABI = CreateNewABI(F, S, IsMatCallback); + std::unique_ptr ABI = + CreateNewABI(F, S, IsMatCallback, {}); + ABI->init(); + return ABI; + }), + OptimizeFrame(OptimizeFrame) {} + +// For back compatibility, constructor takes a materializable callback and +// creates a generator for an ABI with a modified materializable callback. +CoroSplitPass::CoroSplitPass( + std::function IsMatCallback, + SmallVector GenCustomABIs, bool OptimizeFrame) + : CreateAndInitABI([=](Function &F, coro::Shape &S) { + std::unique_ptr ABI = + CreateNewABI(F, S, IsMatCallback, GenCustomABIs); ABI->init(); return ABI; }), diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index f4d9a7a8aa8569..1c45bcd7f6a837 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -73,6 +73,7 @@ static const char *const CoroIntrinsics[] = { "llvm.coro.await.suspend.handle", "llvm.coro.await.suspend.void", "llvm.coro.begin", + "llvm.coro.begin.custom.abi", "llvm.coro.destroy", "llvm.coro.done", "llvm.coro.end", @@ -247,7 +248,8 @@ void coro::Shape::analyze(Function &F, } break; } - case Intrinsic::coro_begin: { + case Intrinsic::coro_begin: + case Intrinsic::coro_begin_custom_abi: { auto CB = cast(II); // Ignore coro id's that aren't pre-split. 
diff --git a/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp b/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp index 1d55889a32d7aa..c3394fdaa940ba 100644 --- a/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp +++ b/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp @@ -182,4 +182,91 @@ TEST_F(ExtraRematTest, TestCoroRematWithCallback) { CallInst *CI = getCallByName(Resume1, "should.remat"); ASSERT_TRUE(CI); } + +StringRef TextCoroBeginCustomABI = R"( + define ptr @f(i32 %n) presplitcoroutine { + entry: + %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) + %size = call i32 @llvm.coro.size.i32() + %alloc = call ptr @malloc(i32 %size) + %hdl = call ptr @llvm.coro.begin.custom.abi(token %id, ptr %alloc, i32 0) + + %inc1 = add i32 %n, 1 + %val2 = call i32 @should.remat(i32 %inc1) + %sp1 = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %sp1, label %suspend [i8 0, label %resume1 + i8 1, label %cleanup] + resume1: + %inc2 = add i32 %val2, 1 + %sp2 = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %sp1, label %suspend [i8 0, label %resume2 + i8 1, label %cleanup] + + resume2: + call void @print(i32 %val2) + call void @print(i32 %inc2) + br label %cleanup + + cleanup: + %mem = call ptr @llvm.coro.free(token %id, ptr %hdl) + call void @free(ptr %mem) + br label %suspend + suspend: + call i1 @llvm.coro.end(ptr %hdl, i1 0) + ret ptr %hdl + } + + declare ptr @llvm.coro.free(token, ptr) + declare i32 @llvm.coro.size.i32() + declare i8 @llvm.coro.suspend(token, i1) + declare void @llvm.coro.resume(ptr) + declare void @llvm.coro.destroy(ptr) + + declare token @llvm.coro.id(i32, ptr, ptr, ptr) + declare i1 @llvm.coro.alloc(token) + declare ptr @llvm.coro.begin.custom.abi(token, ptr, i32) + declare i1 @llvm.coro.end(ptr, i1) + + declare i32 @should.remat(i32) + + declare noalias ptr @malloc(i32) + declare void @print(i32) + declare void @free(ptr) + )"; + +// SwitchABI with overridden isMaterializable +class 
ExtraCustomABI : public coro::SwitchABI { +public: + ExtraCustomABI(Function &F, coro::Shape &S) + : coro::SwitchABI(F, S, ExtraMaterializable) {} +}; + +TEST_F(ExtraRematTest, TestCoroRematWithCustomABI) { + ParseAssembly(TextCoroBeginCustomABI); + + ASSERT_TRUE(M); + + CoroSplitPass::BaseABITy GenCustomABI = [](Function &F, coro::Shape &S) { + return std::unique_ptr(new ExtraCustomABI(F, S)); + }; + + CGSCCPassManager CGPM; + CGPM.addPass(CoroSplitPass({GenCustomABI})); + MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); + MPM.run(*M, MAM); + + // Verify that extra rematerializable instruction has been rematerialized + Function *F = M->getFunction("f.resume"); + ASSERT_TRUE(F) << "could not find split function f.resume"; + + BasicBlock *Resume1 = getBasicBlockByName(F, "resume1"); + ASSERT_TRUE(Resume1) + << "could not find expected BB resume1 in split function"; + + // With callback the extra rematerialization of the function should have + // happened + CallInst *CI = getCallByName(Resume1, "should.remat"); + ASSERT_TRUE(CI); +} + } // namespace From 005e601611095f1bed4ca7e6c37c17645e75ca0c Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 10 Oct 2024 10:11:49 -0400 Subject: [PATCH 019/177] [gn] port 0e913237871e (LLDB_TEST_MAKE) --- llvm/utils/gn/secondary/lldb/test/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/test/BUILD.gn b/llvm/utils/gn/secondary/lldb/test/BUILD.gn index 749fda78b4f171..cb6380882e7cd7 100644 --- a/llvm/utils/gn/secondary/lldb/test/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/test/BUILD.gn @@ -68,6 +68,7 @@ write_lit_cfg("lit_api_site_cfg") { "LLDB_TEST_BUILD_DIRECTORY=" + rebase_path("$target_gen_dir/test_build"), "LLDB_TEST_DSYMUTIL=" + rebase_path("$root_build_dir/bin/dsymutil"), "LLDB_TEST_EXECUTABLE=" + rebase_path("$root_build_dir/bin/lldb"), + "LLDB_TEST_MAKE=make", "LLDB_TEST_MODULE_CACHE_CLANG=" + rebase_path( 
"$target_gen_dir/lldb-test-build.noindex/module-cache-clang"), From 545e0593f8c59376a7ec8c6eb558babf6c9f93c1 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 10 Oct 2024 07:22:28 -0700 Subject: [PATCH 020/177] [libc] Clean up 'vasprintf' implementation (#111761) Summary: This had some leftover references to the old namespace and didn't put restrict on it. --- libc/src/stdio/asprintf.cpp | 7 ++++--- libc/src/stdio/asprintf.h | 2 +- libc/src/stdio/printf_core/vasprintf_internal.h | 6 +++--- libc/src/stdio/vasprintf.cpp | 7 ++++--- libc/src/stdio/vasprintf.h | 3 ++- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp index 88b458a9e103bf..f8cfb74ce48ea2 100644 --- a/libc/src/stdio/asprintf.cpp +++ b/libc/src/stdio/asprintf.cpp @@ -11,10 +11,11 @@ #include "src/__support/macros/config.h" #include "src/stdio/printf_core/vasprintf_internal.h" -namespace LIBC_NAMESPACE { +namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, asprintf, - (char **__restrict buffer, const char *format, ...)) { + (char **__restrict buffer, const char *__restrict format, + ...)) { va_list vlist; va_start(vlist, format); internal::ArgList args(vlist); // This holder class allows for easier copying @@ -25,4 +26,4 @@ LLVM_LIBC_FUNCTION(int, asprintf, return ret; } -} // namespace LIBC_NAMESPACE +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/asprintf.h b/libc/src/stdio/asprintf.h index 0c0d5a350829e7..222dfdee9d4fd7 100644 --- a/libc/src/stdio/asprintf.h +++ b/libc/src/stdio/asprintf.h @@ -13,7 +13,7 @@ namespace LIBC_NAMESPACE { -int asprintf(char **__restrict s, const char *format, ...); +int asprintf(char **__restrict s, const char *__restrict format, ...); } // namespace LIBC_NAMESPACE diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h index 24ebc02a0b33f2..e3448eebd302b7 100644 --- a/libc/src/stdio/printf_core/vasprintf_internal.h +++ 
b/libc/src/stdio/printf_core/vasprintf_internal.h @@ -13,7 +13,7 @@ #include "src/stdio/printf_core/writer.h" #include // malloc, realloc, free -namespace LIBC_NAMESPACE { +namespace LIBC_NAMESPACE_DECL { namespace printf_core { LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { @@ -40,7 +40,7 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { constexpr size_t DEFAULT_BUFFER_SIZE = 200; -LIBC_INLINE int vasprintf_internal(char **ret, const char *format, +LIBC_INLINE int vasprintf_internal(char **ret, const char *__restrict format, internal::ArgList args) { char init_buff_on_stack[DEFAULT_BUFFER_SIZE]; printf_core::WriteBuffer wb(init_buff_on_stack, DEFAULT_BUFFER_SIZE, @@ -64,4 +64,4 @@ LIBC_INLINE int vasprintf_internal(char **ret, const char *format, return ret_val; } } // namespace printf_core -} // namespace LIBC_NAMESPACE +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp index 7fa4cc6f127dda..4a44d4a0f88426 100644 --- a/libc/src/stdio/vasprintf.cpp +++ b/libc/src/stdio/vasprintf.cpp @@ -10,14 +10,15 @@ #include "src/__support/arg_list.h" #include "src/stdio/printf_core/vasprintf_internal.h" -namespace LIBC_NAMESPACE { +namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, vasprintf, - (char **__restrict ret, const char *format, va_list vlist)) { + (char **__restrict ret, const char *__restrict format, + va_list vlist)) { internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. 
return printf_core::vasprintf_internal(ret, format, args); } -} // namespace LIBC_NAMESPACE +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.h b/libc/src/stdio/vasprintf.h index 792e948cf1850c..8b286fe69bf203 100644 --- a/libc/src/stdio/vasprintf.h +++ b/libc/src/stdio/vasprintf.h @@ -13,7 +13,8 @@ namespace LIBC_NAMESPACE { -int vasprintf(char **__restrict s, const char *format, va_list vlist); +int vasprintf(char **__restrict s, const char *__restrict format, + va_list vlist); } // namespace LIBC_NAMESPACE From e023d0270eb32cacdc720bbeea262b2869f7e9e4 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Thu, 10 Oct 2024 10:43:26 -0400 Subject: [PATCH 021/177] [AMDGPU][test]update error dasm test for update-mc-test-check script (#111760) The previous error test line is using a 16bit instruction to indicate an error. However this is a poor pick. The 16bit instructions on AMDGPU is under development and thus, some downstream branches are not showing this exact error message. Changing it to another error dasm code. 
--- .../update_mc_test_checks/Inputs/amdgpu_dasm.txt | 2 +- .../update_mc_test_checks/Inputs/amdgpu_dasm.txt.expected | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt index 9f5fba6e50df25..e8338577cfc47c 100644 --- a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt +++ b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt @@ -2,4 +2,4 @@ 0x00,0x00,0x00,0x7e -0xfd,0xb8,0x0a,0x7f +0x00,0x00,0x00,0x01 diff --git a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt.expected b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt.expected index 1b64695fc29408..a6f7abcb1774ac 100644 --- a/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt.expected +++ b/llvm/test/tools/UpdateTestChecks/update_mc_test_checks/Inputs/amdgpu_dasm.txt.expected @@ -4,5 +4,5 @@ 0x00,0x00,0x00,0x7e # CHECK: v_nop ; encoding: [0x00,0x00,0x00,0x7e] -0xfd,0xb8,0x0a,0x7f +0x00,0x00,0x00,0x01 # CHECK: :[[@LINE-1]]:1: warning: invalid instruction encoding From f59b0c76030aff268b78d475e219708d06b982b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Thu, 10 Oct 2024 15:45:52 +0100 Subject: [PATCH 022/177] [mlir][linalg][nfc] Delete references to args_in/args_out (#111517) After the refactor in: * ed229132f1c4ea2ba0644fc345d8279e47a00565, the `args_in` and `args_out` attributes are no longer used by `linalg.generic`. This patch removes most the remaining references. I've left out BufferDeallocationInternals.md, which doesn't seem maintained anymore and is quite out of sync with other bits of MLIR (e.g. `test.generic` instead of `linalg.generic`). 
--- .../Dialect/Bufferization/Transforms/Passes.td | 7 ------- .../lib/Dialect/Linalg/Transforms/DropUnitDims.cpp | 4 ---- mlir/test/Dialect/Linalg/loops.mlir | 14 -------------- mlir/test/Dialect/Linalg/transform-patterns.mlir | 2 -- .../Linalg/vectorization-with-patterns.mlir | 8 -------- 5 files changed, 35 deletions(-) diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td index a610ddcc9899ed..a683a905cd2d6b 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -32,8 +32,6 @@ def BufferDeallocation : Pass<"buffer-deallocation", "func::FuncOp"> { ^bb2: %0 = memref.alloc() : memref<2xf32> linalg.generic { - args_in = 1 : i64, - args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg1, %0 { ^bb0(%gen1_arg0: f32, %gen1_arg1: f32): @@ -63,8 +61,6 @@ def BufferDeallocation : Pass<"buffer-deallocation", "func::FuncOp"> { ^bb2: // pred: ^bb0 %1 = memref.alloc() : memref<2xf32> linalg.generic { - args_in = 1 : i64, - args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} %arg1, %1 { ^bb0(%arg3: f32, %arg4: f32): @@ -143,8 +139,6 @@ def OwnershipBasedBufferDeallocation : Pass< ^bb2: %0 = memref.alloc() : memref<2xf32> linalg.generic { - args_in = 1 : i64, - args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} outs(%arg1, %0 : memref<2xf32>, memref<2xf32>) { @@ -179,7 +173,6 @@ def OwnershipBasedBufferDeallocation : Pass< indexing_maps = [#map, #map], iterator_types = ["parallel"]} outs(%arg1, %alloc : memref<2xf32>, memref<2xf32>) - attrs = {args_in = 1 : i64, args_out = 1 : i64} { ^bb0(%out: f32, %out_0: f32): %2 = math.exp %out : f32 linalg.yield %2, %out_0 : f32, f32 diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp index 
90ee0fb3bf0b6b..bacc634f5ee554 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @@ -178,8 +178,6 @@ struct MoveInitOperandsToInput : public OpRewritePattern { /// ] /// /// #trait = { -/// args_in = 2, -/// args_out = 1, /// indexing_maps = #accesses, /// iterator_types = ["parallel", "parallel"], /// library_call = "some_external_fn" @@ -210,8 +208,6 @@ struct MoveInitOperandsToInput : public OpRewritePattern { /// ] /// /// #trait = { -/// args_in = 2, -/// args_out = 1, /// indexing_maps = #accesses, /// iterator_types = ["parallel", "parallel"], /// library_call = "some_external_fn" diff --git a/mlir/test/Dialect/Linalg/loops.mlir b/mlir/test/Dialect/Linalg/loops.mlir index 6ddbd06389f5eb..6286a11c11a21f 100644 --- a/mlir/test/Dialect/Linalg/loops.mlir +++ b/mlir/test/Dialect/Linalg/loops.mlir @@ -254,8 +254,6 @@ func.func @copy_view(%arg0: memref>, %arg1: memre affine_map<(i, j, k) -> (i, k, j)> ] #trait2 = { - args_in = 1, - args_out = 2, iterator_types = ["parallel", "parallel", "parallel"], indexing_maps = #accesses, library_call = "some_external_function_name_2", @@ -296,8 +294,6 @@ func.func @generic_region(%arg0: memref>, %a // CHECKPARALLEL: store %[[e]], %{{.*}}[%[[i]], %[[k]], %[[j]]] : memref> #trait4 = { - args_in = 1, - args_out = 2, iterator_types = ["parallel", "parallel", "parallel"], indexing_maps = #accesses, library_call = "some_external_function_name_2", @@ -366,8 +362,6 @@ func.func @generic_index_region( ] #trait_broadcast = { - args_in = 1, - args_out = 1, indexing_maps = #broadcast_access, iterator_types = ["parallel", "parallel"], library_call = "some_broadcast_external_fn" @@ -466,8 +460,6 @@ func.func @generic_index_op_zero_rank(%arg0: memref, %arg1: memref<3x4xi32> ] #trait_reduce_1D = { - args_in = 1, - args_out = 1, indexing_maps = #reduce_1D_access, iterator_types = ["reduction"], library_call = "some_reduce_external_fn" @@ -510,8 +502,6 @@ func.func 
@generic_op_1D_reduce(%arg0: memref, %arg1: memref) ] #trait_reduce_init_1D = { - args_in = 2, - args_out = 1, indexing_maps = #reduce_init_1D_access, iterator_types = ["reduction"], library_call = "some_reduce_external_fn" @@ -559,8 +549,6 @@ func.func @generic_index_op_1D_reduce(%arg0: memref, // CHECKPARALLEL: store %[[e]], %[[ARG2]][] #trait_const_fill = { - args_in = 0, - args_out = 1, indexing_maps = [affine_map<(i) -> (i)>], iterator_types = ["parallel"], library_call = "some_external_fn" @@ -591,8 +579,6 @@ func.func @generic_const_init(%arg0: memref) { affine_map<() -> ()> ] #scalar_trait = { - args_in = 2, - args_out = 1, iterator_types = [], indexing_maps = #scalar_access, library_call = "some_external_fn" diff --git a/mlir/test/Dialect/Linalg/transform-patterns.mlir b/mlir/test/Dialect/Linalg/transform-patterns.mlir index 87b7664198dae1..176e55e3e6c4aa 100644 --- a/mlir/test/Dialect/Linalg/transform-patterns.mlir +++ b/mlir/test/Dialect/Linalg/transform-patterns.mlir @@ -118,8 +118,6 @@ module attributes {transform.with_named_sequence} { affine_map<(m, n, k) -> (m, n)> ] #generic_matmul_trait = { - args_in = 2, - args_out = 1, indexing_maps = #matmul_accesses, library_call = "linalg_matmul", iterator_types = ["parallel", "parallel", "reduction"] diff --git a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir index e7beb725471123..1c6a786bfa436d 100644 --- a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir @@ -83,8 +83,6 @@ module attributes {transform.with_named_sequence} { // ----- #matmul_trait = { - args_in = 2, - args_out = 1, indexing_maps = [ affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (k, n)>, @@ -125,8 +123,6 @@ module attributes {transform.with_named_sequence} { // ----- #matmul_transpose_out_trait = { - args_in = 2, - args_out = 1, indexing_maps = [ affine_map<(m, n, k) -> (m, k)>, 
affine_map<(m, n, k) -> (k, n)>, @@ -196,8 +192,6 @@ module attributes {transform.with_named_sequence} { // ----- #matmul_trait = { - args_in = 2, - args_out = 1, indexing_maps = [ affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (k, n)>, @@ -528,8 +522,6 @@ func.func @generic_vectorize(%arg0: memref<4x256xf32>, // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index %c1_f32 = arith.constant 1.0 : f32 linalg.generic { - args_in = 0 : i64, - args_out = 10 : i64, indexing_maps = [ affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, From 058ede06c4ffd4e3c9f54d947e3bfb027c2d0557 Mon Sep 17 00:00:00 2001 From: Vladislav Dzhidzhoev Date: Thu, 10 Oct 2024 17:14:13 +0200 Subject: [PATCH 023/177] [lldb][test] Use `xcrun -f strip` for API tests on Darwin (#111842) A follow-up for https://github.com/llvm/llvm-project/pull/111816. This is to fix buildbot failure https://lab.llvm.org/staging/#/builders/195/builds/4242. TestSymbolFileJSON.py doesn't pass with llvm-strip on macOS. Apparently, llvm-strip/llvm-objcopy can't clean symbols from Mach-O nlists. 
--- lldb/packages/Python/lldbsuite/test/builders/builder.py | 4 ++++ lldb/test/API/functionalities/json/symbol-file/Makefile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py index d399a5b228c131..de057324694486 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/builder.py +++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py @@ -10,6 +10,7 @@ import lldbsuite.test.lldbutil as lldbutil from lldbsuite.test import configuration from lldbsuite.test_event import build_exception +from lldbsuite.support import seven class Builder: @@ -190,6 +191,9 @@ def getToolchainUtil(util_name): if not util_paths["DWP"]: del util_paths["DWP"] + if lldbplatformutil.platformIsDarwin(): + util_paths["STRIP"] = seven.get_command_output("xcrun -f strip") + for var, path in util_paths.items(): utils.append("%s=%s" % (var, path)) diff --git a/lldb/test/API/functionalities/json/symbol-file/Makefile b/lldb/test/API/functionalities/json/symbol-file/Makefile index aff841c364299c..13bc164582eeee 100644 --- a/lldb/test/API/functionalities/json/symbol-file/Makefile +++ b/lldb/test/API/functionalities/json/symbol-file/Makefile @@ -3,6 +3,6 @@ C_SOURCES := main.c all: stripped.out stripped.out : a.out - strip a.out -o stripped.out + $(STRIP) a.out -o stripped.out include Makefile.rules From 77c842f44cc06951975fd4a85761e0bc830d185a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 10 Oct 2024 08:15:48 -0700 Subject: [PATCH 024/177] [clang-apply-replacements] Avoid repeated hash lookups (NFC) (#111783) --- .../lib/Tooling/ApplyReplacements.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp b/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp index 9e0da82dfd3806..b895075e4f31cc 100644 --- 
a/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp +++ b/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp @@ -148,11 +148,8 @@ groupReplacements(const TUReplacements &TUs, const TUDiagnostics &TUDs, if (auto Entry = SM.getFileManager().getOptionalFileRef(Path)) { if (SourceTU) { - auto &Replaces = DiagReplacements[*Entry]; - auto It = Replaces.find(R); - if (It == Replaces.end()) - Replaces.emplace(R, SourceTU); - else if (It->second != SourceTU) + auto [It, Inserted] = DiagReplacements[*Entry].try_emplace(R, SourceTU); + if (!Inserted && It->second != SourceTU) // This replacement is a duplicate of one suggested by another TU. return; } From d2a96d170a4faa0a6c42fe5f23c073891d6118b8 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 10 Oct 2024 08:16:51 -0700 Subject: [PATCH 025/177] [clang-change-namespace] Avoid repeated hash lookups (NFC) (#111784) --- clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp b/clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp index 879c0d26d472a8..850df7daf5c038 100644 --- a/clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp +++ b/clang-tools-extra/clang-change-namespace/ChangeNamespace.cpp @@ -606,9 +606,8 @@ void ChangeNamespaceTool::run( Result.Nodes.getNodeAs("func_ref")) { // If this reference has been processed as a function call, we do not // process it again. 
- if (ProcessedFuncRefs.count(FuncRef)) + if (!ProcessedFuncRefs.insert(FuncRef).second) return; - ProcessedFuncRefs.insert(FuncRef); const auto *Func = Result.Nodes.getNodeAs("func_decl"); assert(Func); const auto *Context = Result.Nodes.getNodeAs("dc"); From 670a4613fc5f29036f23fe357b0dbf017d019717 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 10 Oct 2024 08:17:25 -0700 Subject: [PATCH 026/177] [clang-tidy] Avoid repeated hash lookups (NFC) (#111785) --- .../clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp index d77df50f8fea24..080454287f28b5 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.cpp @@ -146,12 +146,13 @@ void ForwardDeclarationNamespaceCheck::onEndOfTranslationUnit() { } // Check if a definition in another namespace exists. const auto DeclName = CurDecl->getName(); - if (!DeclNameToDefinitions.contains(DeclName)) { + auto It = DeclNameToDefinitions.find(DeclName); + if (It == DeclNameToDefinitions.end()) { continue; // No definition in this translation unit, we can skip it. } // Make a warning for each definition with the same name (in other // namespaces). 
- const auto &Definitions = DeclNameToDefinitions[DeclName]; + const auto &Definitions = It->second; for (const auto *Def : Definitions) { diag(CurDecl->getLocation(), "no definition found for %0, but a definition with " From 35bbfbc7c0d0782bad5160662c9683b38329c7c1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 10 Oct 2024 08:17:58 -0700 Subject: [PATCH 027/177] [XRay] Simplify code with DenseMap::operator[] (NFC) (#111786) --- llvm/lib/XRay/BlockIndexer.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/llvm/lib/XRay/BlockIndexer.cpp b/llvm/lib/XRay/BlockIndexer.cpp index a99a6815f0d16b..f4ba0eb5bda9cf 100644 --- a/llvm/lib/XRay/BlockIndexer.cpp +++ b/llvm/lib/XRay/BlockIndexer.cpp @@ -80,12 +80,9 @@ Error BlockIndexer::visit(FunctionRecord &R) { } Error BlockIndexer::flush() { - Index::iterator It; - std::tie(It, std::ignore) = - Indices.insert({{CurrentBlock.ProcessID, CurrentBlock.ThreadID}, {}}); - It->second.push_back({CurrentBlock.ProcessID, CurrentBlock.ThreadID, - CurrentBlock.WallclockTime, - std::move(CurrentBlock.Records)}); + Indices[{CurrentBlock.ProcessID, CurrentBlock.ThreadID}].push_back( + {CurrentBlock.ProcessID, CurrentBlock.ThreadID, + CurrentBlock.WallclockTime, std::move(CurrentBlock.Records)}); CurrentBlock.ProcessID = 0; CurrentBlock.ThreadID = 0; CurrentBlock.Records = {}; From fc467b477545c9f8ef4dc36ecee4dcd2a7457787 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 10 Oct 2024 08:18:28 -0700 Subject: [PATCH 028/177] [AMDGPU] Avoid repeated hash lookups (NFC) (#111787) --- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 7e4d9d21a0b397..1b88fdd3ab2e1c 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1647,16 +1647,18 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue 
BuildVector, SDValue Swz[], BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); for (unsigned i = 0; i < 4; i++) { unsigned Idx = Swz[i]->getAsZExtVal(); - if (SwizzleRemap.contains(Idx)) - Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); + auto It = SwizzleRemap.find(Idx); + if (It != SwizzleRemap.end()) + Swz[i] = DAG.getConstant(It->second, DL, MVT::i32); } SwizzleRemap.clear(); BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); for (unsigned i = 0; i < 4; i++) { unsigned Idx = Swz[i]->getAsZExtVal(); - if (SwizzleRemap.contains(Idx)) - Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); + auto It = SwizzleRemap.find(Idx); + if (It != SwizzleRemap.end()) + Swz[i] = DAG.getConstant(It->second, DL, MVT::i32); } return BuildVector; From 97a43242246bf4a55e68bddf3e6a0500c07803cc Mon Sep 17 00:00:00 2001 From: alx32 <103613512+alx32@users.noreply.github.com> Date: Thu, 10 Oct 2024 08:22:48 -0700 Subject: [PATCH 029/177] [lld-macho] Fix ICF differentiation of safe_thunks relocs (#111811) In `--icf=safe_thunks` mode, the linker differentiates `keepUnique` functions by creating thunks during a post-processing step after Identical Code Folding (ICF). While this ensures that `keepUnique` functions themselves are not incorrectly merged, it overlooks functions that reference these `keepUnique` symbols. If two functions are identical except for references to different `keepUnique` functions, the current ICF algorithm incorrectly considers them identical because it doesn't account for the future differentiation introduced by thunks. This leads to incorrect deduplication of functions that should remain distinct. To address this issue, we modify the ICF comparison to explicitly check for references to `keepUnique` functions during deduplication. By doing so, functions that reference different `keepUnique` symbols are correctly identified as distinct, preventing erroneous merging and ensuring the correctness of the linked output. 
--- lld/MachO/ICF.cpp | 11 +++++++ lld/test/MachO/icf-safe-thunks.ll | 49 +++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp index 2ff962b06e3679..aedaecfdeb2c01 100644 --- a/lld/MachO/ICF.cpp +++ b/lld/MachO/ICF.cpp @@ -147,6 +147,17 @@ bool ICF::equalsConstant(const ConcatInputSection *ia, isecB = rb.referent.get(); } + // Typically, we should not encounter sections marked with `keepUnique` at + // this point as they would have resulted in different hashes and therefore + // no need for a full comparison. + // However, in `safe_thunks` mode, it's possible for two different + // relocations to reference identical `keepUnique` functions that will be + // distinguished later via thunks - so we need to handle this case + // explicitly. + if ((isecA != isecB) && ((isecA->keepUnique && isCodeSection(isecA)) || + (isecB->keepUnique && isCodeSection(isecB)))) + return false; + if (isecA->parent != isecB->parent) return false; // Sections with identical parents should be of the same kind. 
diff --git a/lld/test/MachO/icf-safe-thunks.ll b/lld/test/MachO/icf-safe-thunks.ll index 238e90f952e160..95e00a5b98385b 100644 --- a/lld/test/MachO/icf-safe-thunks.ll +++ b/lld/test/MachO/icf-safe-thunks.ll @@ -22,6 +22,13 @@ ; CHECK-ARM64-NEXT: _func_3identical_v3_canmerge: ; CHECK-ARM64-NEXT: mov {{.*}}, #0x21 ; +; CHECK-ARM64: _func_call_thunked_1_nomerge: +; CHECK-ARM64-NEXT: stp x29 +; +; CHECK-ARM64: _func_call_thunked_2_nomerge: +; CHECK-ARM64-NEXT: _func_call_thunked_2_merge: +; CHECK-ARM64-NEXT: stp x29 +; ; CHECK-ARM64: _call_all_funcs: ; CHECK-ARM64-NEXT: stp x29 ; @@ -43,6 +50,9 @@ ; CHECK-ARM64-MAP-NEXT: 0x00000010 [ 2] _func_3identical_v1_canmerge ; CHECK-ARM64-MAP-NEXT: 0x00000000 [ 2] _func_3identical_v2_canmerge ; CHECK-ARM64-MAP-NEXT: 0x00000000 [ 2] _func_3identical_v3_canmerge +; CHECK-ARM64-MAP-NEXT: 0x00000020 [ 2] _func_call_thunked_1_nomerge +; CHECK-ARM64-MAP-NEXT: 0x00000020 [ 2] _func_call_thunked_2_nomerge +; CHECK-ARM64-MAP-NEXT: 0x00000000 [ 2] _func_call_thunked_2_merge ; CHECK-ARM64-MAP-NEXT: 0x00000034 [ 2] _call_all_funcs ; CHECK-ARM64-MAP-NEXT: 0x00000050 [ 2] _take_func_addr ; CHECK-ARM64-MAP-NEXT: 0x00000004 [ 2] _func_2identical_v2 @@ -125,6 +135,30 @@ entry: ret void } +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_call_thunked_1_nomerge() local_unnamed_addr #0 { +entry: + tail call void @func_2identical_v1() + store volatile i8 77, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_call_thunked_2_nomerge() local_unnamed_addr #0 { +entry: + tail call void @func_2identical_v2() + store volatile i8 77, ptr @g_val, align 1, !tbaa !5 + ret void +} + +; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp memory(readwrite, argmem: none) uwtable(sync) +define void @func_call_thunked_2_merge() 
local_unnamed_addr #0 { +entry: + tail call void @func_2identical_v2() + store volatile i8 77, ptr @g_val, align 1, !tbaa !5 + ret void +} + ; Function Attrs: mustprogress nofree noinline norecurse nounwind ssp uwtable(sync) define void @call_all_funcs() local_unnamed_addr #1 { entry: @@ -227,6 +261,21 @@ attributes #1 = { mustprogress nofree noinline norecurse nounwind ssp uwtable(sy ; g_val = 33; ; } ; +; ATTR void func_call_thunked_1_nomerge() { +; func_2identical_v1(); +; g_val = 77; +; } +; +; ATTR void func_call_thunked_2_nomerge() { +; func_2identical_v2(); +; g_val = 77; +; } +; +; ATTR void func_call_thunked_2_merge() { +; func_2identical_v2(); +; g_val = 77; +; } +; ; ATTR void call_all_funcs() { ; func_unique_1(); ; func_unique_2_canmerge(); From 4ddc756bccb34f3d07e30c9ca96bba32cb0cf4f9 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Thu, 10 Oct 2024 17:25:57 +0200 Subject: [PATCH 030/177] Revert "[flang] correctly deal with bind(c) derived type result ABI" (#111858) Reverts llvm/llvm-project#111678 Causes ARM failure in test suite. TYPE(C_PTR) result should not regress even if struct ABI no implemented for the target. https://lab.llvm.org/buildbot/#/builders/143/builds/2731 I need to revisit this. 
--- .../include/flang/Optimizer/CodeGen/Target.h | 5 - .../flang/Optimizer/Dialect/FIROpsSupport.h | 21 --- flang/lib/Optimizer/CodeGen/Target.cpp | 68 +-------- flang/lib/Optimizer/CodeGen/TargetRewrite.cpp | 137 ++++-------------- .../Optimizer/Transforms/AbstractResult.cpp | 65 +-------- flang/test/Fir/abstract-results-bindc.fir | 43 ------ flang/test/Fir/struct-return-x86-64.fir | 120 --------------- 7 files changed, 40 insertions(+), 419 deletions(-) delete mode 100644 flang/test/Fir/abstract-results-bindc.fir delete mode 100644 flang/test/Fir/struct-return-x86-64.fir diff --git a/flang/include/flang/Optimizer/CodeGen/Target.h b/flang/include/flang/Optimizer/CodeGen/Target.h index 3b38583511927a..a7161152a5c323 100644 --- a/flang/include/flang/Optimizer/CodeGen/Target.h +++ b/flang/include/flang/Optimizer/CodeGen/Target.h @@ -126,11 +126,6 @@ class CodeGenSpecifics { structArgumentType(mlir::Location loc, fir::RecordType recTy, const Marshalling &previousArguments) const = 0; - /// Type representation of a `fir.type` type argument when returned by - /// value. Such value may need to be converted to a hidden reference argument. - virtual Marshalling structReturnType(mlir::Location loc, - fir::RecordType eleTy) const = 0; - /// Type representation of a `boxchar` type argument when passed by value. /// An argument value may need to be passed as a (safe) reference argument. 
/// diff --git a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h index fb7b1d16f62f3a..cdbefdb2341485 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h +++ b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h @@ -177,27 +177,6 @@ inline mlir::NamedAttribute getAdaptToByRefAttr(Builder &builder) { } bool isDummyArgument(mlir::Value v); - -template -inline bool hasProcedureAttr(fir::FortranProcedureFlagsEnumAttr flags) { - return flags && bitEnumContainsAny(flags.getValue(), Flag); -} - -template -inline bool hasProcedureAttr(mlir::Operation *op) { - if (auto firCallOp = mlir::dyn_cast(op)) - return hasProcedureAttr(firCallOp.getProcedureAttrsAttr()); - if (auto firCallOp = mlir::dyn_cast(op)) - return hasProcedureAttr(firCallOp.getProcedureAttrsAttr()); - return hasProcedureAttr( - op->getAttrOfType( - getFortranProcedureFlagsAttrName())); -} - -inline bool hasBindcAttr(mlir::Operation *op) { - return hasProcedureAttr(op); -} - } // namespace fir #endif // FORTRAN_OPTIMIZER_DIALECT_FIROPSSUPPORT_H diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp index 6c148dffb0e55a..a12b59413f4456 100644 --- a/flang/lib/Optimizer/CodeGen/Target.cpp +++ b/flang/lib/Optimizer/CodeGen/Target.cpp @@ -100,11 +100,6 @@ struct GenericTarget : public CodeGenSpecifics { TODO(loc, "passing VALUE BIND(C) derived type for this target"); } - CodeGenSpecifics::Marshalling - structReturnType(mlir::Location loc, fir::RecordType ty) const override { - TODO(loc, "returning BIND(C) derived type for this target"); - } - CodeGenSpecifics::Marshalling integerArgumentType(mlir::Location loc, mlir::IntegerType argTy) const override { @@ -538,8 +533,7 @@ struct TargetX86_64 : public GenericTarget { /// When \p recTy is a one field record type that can be passed /// like the field on its own, returns the field type. Returns /// a null type otherwise. 
- mlir::Type passAsFieldIfOneFieldStruct(fir::RecordType recTy, - bool allowComplex = false) const { + mlir::Type passAsFieldIfOneFieldStruct(fir::RecordType recTy) const { auto typeList = recTy.getTypeList(); if (typeList.size() != 1) return {}; @@ -547,8 +541,6 @@ struct TargetX86_64 : public GenericTarget { if (mlir::isa( fieldType)) return fieldType; - if (allowComplex && mlir::isa(fieldType)) - return fieldType; if (mlir::isa(fieldType)) { // Only CHARACTER(1) are expected in BIND(C) contexts, which is the only // contexts where derived type may be passed in registers. @@ -601,7 +593,7 @@ struct TargetX86_64 : public GenericTarget { postMerge(byteOffset, Lo, Hi); if (Lo == ArgClass::Memory || Lo == ArgClass::X87 || Lo == ArgClass::ComplexX87) - return passOnTheStack(loc, recTy, /*isResult=*/false); + return passOnTheStack(loc, recTy); int neededIntRegisters = 0; int neededSSERegisters = 0; if (Lo == ArgClass::SSE) @@ -617,7 +609,7 @@ struct TargetX86_64 : public GenericTarget { // all in registers or all on the stack). if (!hasEnoughRegisters(loc, neededIntRegisters, neededSSERegisters, previousArguments)) - return passOnTheStack(loc, recTy, /*isResult=*/false); + return passOnTheStack(loc, recTy); if (auto fieldType = passAsFieldIfOneFieldStruct(recTy)) { CodeGenSpecifics::Marshalling marshal; @@ -649,57 +641,9 @@ struct TargetX86_64 : public GenericTarget { return marshal; } - CodeGenSpecifics::Marshalling - structReturnType(mlir::Location loc, fir::RecordType recTy) const override { - std::uint64_t byteOffset = 0; - ArgClass Lo, Hi; - Lo = Hi = ArgClass::NoClass; - byteOffset = classifyStruct(loc, recTy, byteOffset, Lo, Hi); - mlir::MLIRContext *context = recTy.getContext(); - postMerge(byteOffset, Lo, Hi); - if (Lo == ArgClass::Memory) - return passOnTheStack(loc, recTy, /*isResult=*/true); - - // Note that X87/ComplexX87 are passed in memory, but returned via %st0 - // %st1 registers. 
Here, they are returned as fp80 or {fp80, fp80} by - // passAsFieldIfOneFieldStruct, and LLVM will use the expected registers. - - // Note that {_Complex long double} is not 100% clear from an ABI - // perspective because the aggregate post merger rules say it should be - // passed in memory because it is bigger than 2 eight bytes. This has the - // funny effect of - // {_Complex long double} return to be dealt with differently than - // _Complex long double. - - if (auto fieldType = - passAsFieldIfOneFieldStruct(recTy, /*allowComplex=*/true)) { - if (auto complexType = mlir::dyn_cast(fieldType)) - return complexReturnType(loc, complexType.getElementType()); - CodeGenSpecifics::Marshalling marshal; - marshal.emplace_back(fieldType, AT{}); - return marshal; - } - - if (Hi == ArgClass::NoClass || Hi == ArgClass::SSEUp) { - // Return a single integer or floating point argument. - mlir::Type lowType = pickLLVMArgType(loc, context, Lo, byteOffset); - CodeGenSpecifics::Marshalling marshal; - marshal.emplace_back(lowType, AT{}); - return marshal; - } - // Will be returned in two different registers. Generate {lowTy, HiTy} for - // the LLVM IR result type. - CodeGenSpecifics::Marshalling marshal; - mlir::Type lowType = pickLLVMArgType(loc, context, Lo, 8u); - mlir::Type hiType = pickLLVMArgType(loc, context, Hi, byteOffset - 8u); - marshal.emplace_back(mlir::TupleType::get(context, {lowType, hiType}), - AT{}); - return marshal; - } - /// Marshal an argument that must be passed on the stack. 
- CodeGenSpecifics::Marshalling - passOnTheStack(mlir::Location loc, mlir::Type ty, bool isResult) const { + CodeGenSpecifics::Marshalling passOnTheStack(mlir::Location loc, + mlir::Type ty) const { CodeGenSpecifics::Marshalling marshal; auto sizeAndAlign = fir::getTypeSizeAndAlignmentOrCrash(loc, ty, getDataLayout(), kindMap); @@ -707,7 +651,7 @@ struct TargetX86_64 : public GenericTarget { unsigned short align = std::max(sizeAndAlign.second, static_cast(8)); marshal.emplace_back(fir::ReferenceType::get(ty), - AT{align, /*byval=*/!isResult, /*sret=*/isResult}); + AT{align, /*byval=*/true, /*sret=*/false}); return marshal; } }; diff --git a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp index 04a3ea684642c8..fd56fd6bf50f44 100644 --- a/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/TargetRewrite.cpp @@ -142,16 +142,20 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { mlir::ModuleOp getModule() { return getOperation(); } - template + template std::optional> - rewriteCallResultType(mlir::Location loc, mlir::Type originalResTy, - Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, - Callback &newOpers, mlir::Value &savedStackPtr, - fir::CodeGenSpecifics::Marshalling &m) { - // Currently, targets mandate COMPLEX or STRUCT is a single aggregate or - // packed scalar, including the sret case. - assert(m.size() == 1 && "return type not supported on this target"); + rewriteCallComplexResultType( + mlir::Location loc, A ty, B &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, C &newOpers, + mlir::Value &savedStackPtr) { + if (noComplexConversion) { + newResTys.push_back(ty); + return std::nullopt; + } + auto m = specifics->complexReturnType(loc, ty.getElementType()); + // Currently targets mandate COMPLEX is a single aggregate or packed + // scalar, including the sret case. 
+ assert(m.size() == 1 && "target of complex return not supported"); auto resTy = std::get(m[0]); auto attr = std::get(m[0]); if (attr.isSRet()) { @@ -166,7 +170,7 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { newInTyAndAttrs.push_back(m[0]); newOpers.push_back(stack); return [=](mlir::Operation *) -> mlir::Value { - auto memTy = fir::ReferenceType::get(originalResTy); + auto memTy = fir::ReferenceType::get(ty); auto cast = rewriter->create(loc, memTy, stack); return rewriter->create(loc, cast); }; @@ -176,41 +180,11 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { // We are going to generate an alloca, so save the stack pointer. if (!savedStackPtr) savedStackPtr = genStackSave(loc); - return this->convertValueInMemory(loc, call->getResult(0), originalResTy, + return this->convertValueInMemory(loc, call->getResult(0), ty, /*inputMayBeBigger=*/true); }; } - template - std::optional> - rewriteCallComplexResultType( - mlir::Location loc, mlir::ComplexType ty, Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, Callback &newOpers, - mlir::Value &savedStackPtr) { - if (noComplexConversion) { - newResTys.push_back(ty); - return std::nullopt; - } - auto m = specifics->complexReturnType(loc, ty.getElementType()); - return rewriteCallResultType(loc, ty, newResTys, newInTyAndAttrs, newOpers, - savedStackPtr, m); - } - - template - std::optional> - rewriteCallStructResultType( - mlir::Location loc, fir::RecordType recTy, Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, Callback &newOpers, - mlir::Value &savedStackPtr) { - if (noStructConversion) { - newResTys.push_back(recTy); - return std::nullopt; - } - auto m = specifics->structReturnType(loc, recTy); - return rewriteCallResultType(loc, recTy, newResTys, newInTyAndAttrs, - newOpers, savedStackPtr, m); - } - void passArgumentOnStackOrWithNewType( mlir::Location loc, fir::CodeGenSpecifics::TypeAndAttr newTypeAndAttr, mlir::Type oldType, mlir::Value 
oper, @@ -382,11 +356,6 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { newInTyAndAttrs, newOpers, savedStackPtr); }) - .template Case([&](fir::RecordType recTy) { - wrap = rewriteCallStructResultType(loc, recTy, newResTys, - newInTyAndAttrs, newOpers, - savedStackPtr); - }) .Default([&](mlir::Type ty) { newResTys.push_back(ty); }); } else if (fnTy.getResults().size() > 1) { TODO(loc, "multiple results not supported yet"); @@ -593,24 +562,6 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { } } - template - void - lowerStructSignatureRes(mlir::Location loc, fir::RecordType recTy, - Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs) { - if (noComplexConversion) { - newResTys.push_back(recTy); - return; - } else { - for (auto &tup : specifics->structReturnType(loc, recTy)) { - if (std::get(tup).isSRet()) - newInTyAndAttrs.push_back(tup); - else - newResTys.push_back(std::get(tup)); - } - } - } - void lowerStructSignatureArg(mlir::Location loc, fir::RecordType recTy, fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs) { @@ -644,9 +595,6 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { .Case([&](mlir::ComplexType ty) { lowerComplexSignatureRes(loc, ty, newResTys, newInTyAndAttrs); }) - .Case([&](fir::RecordType ty) { - lowerStructSignatureRes(loc, ty, newResTys, newInTyAndAttrs); - }) .Default([&](mlir::Type ty) { newResTys.push_back(ty); }); } llvm::SmallVector trailingInTys; @@ -748,8 +696,7 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { for (auto ty : func.getResults()) if ((mlir::isa(ty) && !noCharacterConversion) || (fir::isa_complex(ty) && !noComplexConversion) || - (mlir::isa(ty) && hasCCallingConv) || - (mlir::isa(ty) && !noStructConversion)) { + (mlir::isa(ty) && hasCCallingConv)) { LLVM_DEBUG(llvm::dbgs() << "rewrite " << signature << " for target\n"); return false; } @@ -823,9 +770,6 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { 
rewriter->getUnitAttr())); newResTys.push_back(retTy); }) - .Case([&](fir::RecordType recTy) { - doStructReturn(func, recTy, newResTys, newInTyAndAttrs, fixups); - }) .Default([&](mlir::Type ty) { newResTys.push_back(ty); }); // Saved potential shift in argument. Handling of result can add arguments @@ -1118,12 +1062,21 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { return false; } + /// Convert a complex return value. This can involve converting the return + /// value to a "hidden" first argument or packing the complex into a wide + /// GPR. template - void doReturn(mlir::func::FuncOp func, Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, - FIXUPS &fixups, fir::CodeGenSpecifics::Marshalling &m) { - assert(m.size() == 1 && - "expect result to be turned into single argument or result so far"); + void doComplexReturn(mlir::func::FuncOp func, mlir::ComplexType cmplx, + Ty &newResTys, + fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, + FIXUPS &fixups) { + if (noComplexConversion) { + newResTys.push_back(cmplx); + return; + } + auto m = + specifics->complexReturnType(func.getLoc(), cmplx.getElementType()); + assert(m.size() == 1); auto &tup = m[0]; auto attr = std::get(tup); auto argTy = std::get(tup); @@ -1164,36 +1117,6 @@ class TargetRewrite : public fir::impl::TargetRewritePassBase { newResTys.push_back(argTy); } - /// Convert a complex return value. This can involve converting the return - /// value to a "hidden" first argument or packing the complex into a wide - /// GPR. 
- template - void doComplexReturn(mlir::func::FuncOp func, mlir::ComplexType cmplx, - Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, - FIXUPS &fixups) { - if (noComplexConversion) { - newResTys.push_back(cmplx); - return; - } - auto m = - specifics->complexReturnType(func.getLoc(), cmplx.getElementType()); - doReturn(func, newResTys, newInTyAndAttrs, fixups, m); - } - - template - void doStructReturn(mlir::func::FuncOp func, fir::RecordType recTy, - Ty &newResTys, - fir::CodeGenSpecifics::Marshalling &newInTyAndAttrs, - FIXUPS &fixups) { - if (noStructConversion) { - newResTys.push_back(recTy); - return; - } - auto m = specifics->structReturnType(func.getLoc(), recTy); - doReturn(func, newResTys, newInTyAndAttrs, fixups, m); - } - template void createFuncOpArgFixups(mlir::func::FuncOp func, diff --git a/flang/lib/Optimizer/Transforms/AbstractResult.cpp b/flang/lib/Optimizer/Transforms/AbstractResult.cpp index c0ec820d87ed44..7299ff80121e13 100644 --- a/flang/lib/Optimizer/Transforms/AbstractResult.cpp +++ b/flang/lib/Optimizer/Transforms/AbstractResult.cpp @@ -32,33 +32,6 @@ using namespace mlir; namespace fir { namespace { -// Helper to only build the symbol table if needed because its build time is -// linear on the number of symbols in the module. 
-struct LazySymbolTable { - LazySymbolTable(mlir::Operation *op) - : module{op->getParentOfType()} {} - void build() { - if (table) - return; - table = std::make_unique(module); - } - - template - T lookup(llvm::StringRef name) { - build(); - return table->lookup(name); - } - -private: - std::unique_ptr table; - mlir::ModuleOp module; -}; - -bool hasScalarDerivedResult(mlir::FunctionType funTy) { - return funTy.getNumResults() == 1 && - mlir::isa(funTy.getResult(0)); -} - static mlir::Type getResultArgumentType(mlir::Type resultType, bool shouldBoxResult) { return llvm::TypeSwitch(resultType) @@ -217,14 +190,7 @@ class SaveResultOpConversion llvm::LogicalResult matchAndRewrite(fir::SaveResultOp op, mlir::PatternRewriter &rewriter) const override { - mlir::Operation *call = op.getValue().getDefiningOp(); - if (mlir::isa(op.getValue().getType()) && call && - fir::hasBindcAttr(call)) { - rewriter.replaceOpWithNewOp(op, op.getValue(), - op.getMemref()); - } else { - rewriter.eraseOp(op); - } + rewriter.eraseOp(op); return mlir::success(); } }; @@ -334,12 +300,6 @@ class AbstractResultOpt auto *context = &getContext(); // Convert function type itself if it has an abstract result. auto funcTy = mlir::cast(func.getFunctionType()); - // Scalar derived result of BIND(C) function must be returned according - // to the C struct return ABI which is target dependent and implemented in - // the target-rewrite pass. 
- if (hasScalarDerivedResult(funcTy) && - fir::hasBindcAttr(func.getOperation())) - return; if (hasAbstractResult(funcTy)) { if (fir::isa_builtin_cptr_type(funcTy.getResult(0))) { func.setType(getCPtrFunctionType(funcTy)); @@ -435,8 +395,6 @@ class AbstractResultOpt return; } - LazySymbolTable symbolTable(op); - mlir::RewritePatternSet patterns(context); mlir::ConversionTarget target = *context; const bool shouldBoxResult = this->passResultAsBox.getValue(); @@ -451,29 +409,14 @@ class AbstractResultOpt mlir::func::FuncDialect>(); target.addIllegalOp(); target.addDynamicallyLegalOp([](fir::CallOp call) { - mlir::FunctionType funTy = call.getFunctionType(); - if (hasScalarDerivedResult(funTy) && - fir::hasBindcAttr(call.getOperation())) - return true; - return !hasAbstractResult(funTy); + return !hasAbstractResult(call.getFunctionType()); }); - target.addDynamicallyLegalOp([&symbolTable]( - fir::AddrOfOp addrOf) { - if (auto funTy = mlir::dyn_cast(addrOf.getType())) { - if (hasScalarDerivedResult(funTy)) { - auto func = symbolTable.lookup( - addrOf.getSymbol().getRootReference().getValue()); - return func && fir::hasBindcAttr(func.getOperation()); - } + target.addDynamicallyLegalOp([](fir::AddrOfOp addrOf) { + if (auto funTy = mlir::dyn_cast(addrOf.getType())) return !hasAbstractResult(funTy); - } return true; }); target.addDynamicallyLegalOp([](fir::DispatchOp dispatch) { - mlir::FunctionType funTy = dispatch.getFunctionType(); - if (hasScalarDerivedResult(funTy) && - fir::hasBindcAttr(dispatch.getOperation())) - return true; return !hasAbstractResult(dispatch.getFunctionType()); }); diff --git a/flang/test/Fir/abstract-results-bindc.fir b/flang/test/Fir/abstract-results-bindc.fir deleted file mode 100644 index 9b26730f7d2923..00000000000000 --- a/flang/test/Fir/abstract-results-bindc.fir +++ /dev/null @@ -1,43 +0,0 @@ -// Test that bind_c derived type results are not moved to a hidden argument -// by the abstract-result pass. 
They will be dealt with according to the C -// struct returning ABI for the target in the target-rewrite pass. -// RUN: fir-opt %s --abstract-result | FileCheck %s - -!t = !fir.type - -func.func private @foo() -> !t attributes {fir.proc_attrs = #fir.proc_attrs} - -func.func @test_call(%x: !fir.ref) { - %0 = fir.call @foo() proc_attrs : () -> !t - fir.save_result %0 to %x : !t, !fir.ref - return -} - -func.func @test_addr_of() -> (() -> !t) { - %0 = fir.address_of(@foo) : () -> !t - return %0 : () -> !t -} - -func.func @test_dispatch(%x: !fir.ref, %y : !fir.class>) { - %0 = fir.dispatch "bar"(%y : !fir.class>) (%y : !fir.class>) -> !t proc_attrs {pass_arg_pos = 0 : i32} - fir.save_result %0 to %x : !t, !fir.ref - return -} - -// CHECK-LABEL: func.func @test_call( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>) { -// CHECK: %[[VAL_1:.*]] = fir.call @foo() proc_attrs : () -> !fir.type -// CHECK: fir.store %[[VAL_1]] to %[[VAL_0]] : !fir.ref> -// CHECK: return -// CHECK: } -// CHECK-LABEL: func.func @test_addr_of() -> (() -> !fir.type) { -// CHECK: %[[VAL_0:.*]] = fir.address_of(@foo) : () -> !fir.type -// CHECK: return %[[VAL_0]] : () -> !fir.type -// CHECK: } -// CHECK-LABEL: func.func @test_dispatch( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, -// CHECK-SAME: %[[VAL_1:.*]]: !fir.class>) { -// CHECK: %[[VAL_2:.*]] = fir.dispatch "bar"(%[[VAL_1]] : !fir.class>) (%[[VAL_1]] : !fir.class>) -> !fir.type proc_attrs {pass_arg_pos = 0 : i32} -// CHECK: fir.store %[[VAL_2]] to %[[VAL_0]] : !fir.ref> -// CHECK: return -// CHECK: } diff --git a/flang/test/Fir/struct-return-x86-64.fir b/flang/test/Fir/struct-return-x86-64.fir deleted file mode 100644 index f4c2add69ff7e9..00000000000000 --- a/flang/test/Fir/struct-return-x86-64.fir +++ /dev/null @@ -1,120 +0,0 @@ -// Test X86-64 ABI rewrite of struct returned by value (BIND(C), VALUE derived types). 
-// REQUIRES: x86-registered-target -// RUN: fir-opt --target-rewrite %s | FileCheck %s - -!fits_in_reg = !fir.type -!too_big = !fir.type}> - -module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { - - func.func private @test_inreg() -> !fits_in_reg - func.func @test_call_inreg(%arg0: !fir.ref) { - %0 = fir.call @test_inreg() : () -> !fits_in_reg - fir.store %0 to %arg0 : !fir.ref - return - } - func.func @test_addr_of_inreg() -> (() -> ()) { - %0 = fir.address_of(@test_inreg) : () -> !fits_in_reg - %1 = fir.convert %0 : (() -> !fits_in_reg) -> (() -> ()) - return %1 : () -> () - } - func.func @test_dispatch_inreg(%arg0: !fir.ref, %arg1: !fir.class>) { - %0 = fir.dispatch "bar"(%arg1 : !fir.class>) (%arg1 : !fir.class>) -> !fits_in_reg {pass_arg_pos = 0 : i32} - fir.store %0 to %arg0 : !fir.ref - return - } - - func.func private @test_sret() -> !too_big - func.func @test_call_sret(%arg0: !fir.ref) { - %0 = fir.call @test_sret() : () -> !too_big - fir.store %0 to %arg0 : !fir.ref - return - } - func.func @test_addr_of_sret() -> (() -> ()) { - %0 = fir.address_of(@test_sret) : () -> !too_big - %1 = fir.convert %0 : (() -> !too_big) -> (() -> ()) - return %1 : () -> () - } - func.func @test_dispatch_sret(%arg0: !fir.ref, %arg1: !fir.class>) { - %0 = fir.dispatch "bar"(%arg1 : !fir.class>) (%arg1 : !fir.class>) -> !too_big {pass_arg_pos = 0 : i32} - fir.store %0 to %arg0 : !fir.ref - return - } - func.func private @test_fp_80() -> !fir.type - func.func private @test_complex_80() -> !fir.type}> - func.func private @test_two_fp_80() -> !fir.type - func.func private @test_fp128() -> !fir.type -} - -// CHECK-LABEL: func.func private @test_inreg() -> tuple - -// CHECK-LABEL: func.func @test_call_inreg( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>) { -// CHECK: %[[VAL_1:.*]] = fir.call @test_inreg() : () -> tuple 
-// CHECK: %[[VAL_2:.*]] = llvm.intr.stacksave : !llvm.ptr -// CHECK: %[[VAL_3:.*]] = fir.alloca tuple -// CHECK: fir.store %[[VAL_1]] to %[[VAL_3]] : !fir.ref> -// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref>) -> !fir.ref> -// CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]] : !fir.ref> -// CHECK: llvm.intr.stackrestore %[[VAL_2]] : !llvm.ptr -// CHECK: fir.store %[[VAL_5]] to %[[VAL_0]] : !fir.ref> -// CHECK: return -// CHECK: } - -// CHECK-LABEL: func.func @test_addr_of_inreg() -> (() -> ()) { -// CHECK: %[[VAL_0:.*]] = fir.address_of(@test_inreg) : () -> tuple -// CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (() -> tuple) -> (() -> ()) -// CHECK: return %[[VAL_1]] : () -> () -// CHECK: } - -// CHECK-LABEL: func.func @test_dispatch_inreg( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, -// CHECK-SAME: %[[VAL_1:.*]]: !fir.class>) { -// CHECK: %[[VAL_2:.*]] = fir.dispatch "bar"(%[[VAL_1]] : !fir.class>) (%[[VAL_1]] : !fir.class>) -> tuple {pass_arg_pos = 0 : i32} -// CHECK: %[[VAL_3:.*]] = llvm.intr.stacksave : !llvm.ptr -// CHECK: %[[VAL_4:.*]] = fir.alloca tuple -// CHECK: fir.store %[[VAL_2]] to %[[VAL_4]] : !fir.ref> -// CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>) -> !fir.ref> -// CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref> -// CHECK: llvm.intr.stackrestore %[[VAL_3]] : !llvm.ptr -// CHECK: fir.store %[[VAL_6]] to %[[VAL_0]] : !fir.ref> -// CHECK: return -// CHECK: } -// CHECK: func.func private @test_sret(!fir.ref}>> {llvm.align = 8 : i32, llvm.sret = !fir.type}>}) - -// CHECK-LABEL: func.func @test_call_sret( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref}>>) { -// CHECK: %[[VAL_1:.*]] = llvm.intr.stacksave : !llvm.ptr -// CHECK: %[[VAL_2:.*]] = fir.alloca !fir.type}> -// CHECK: fir.call @test_sret(%[[VAL_2]]) : (!fir.ref}>>) -> () -// CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]] : (!fir.ref}>>) -> !fir.ref}>> -// CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.ref}>> -// CHECK: llvm.intr.stackrestore %[[VAL_1]] : !llvm.ptr -// 
CHECK: fir.store %[[VAL_4]] to %[[VAL_0]] : !fir.ref}>> -// CHECK: return -// CHECK: } - -// CHECK-LABEL: func.func @test_addr_of_sret() -> (() -> ()) { -// CHECK: %[[VAL_0:.*]] = fir.address_of(@test_sret) : (!fir.ref}>>) -> () -// CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : ((!fir.ref}>>) -> ()) -> (() -> ()) -// CHECK: return %[[VAL_1]] : () -> () -// CHECK: } - -// CHECK-LABEL: func.func @test_dispatch_sret( -// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref}>>, -// CHECK-SAME: %[[VAL_1:.*]]: !fir.class>) { -// CHECK: %[[VAL_2:.*]] = llvm.intr.stacksave : !llvm.ptr -// CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type}> -// CHECK: fir.dispatch "bar"(%[[VAL_1]] : !fir.class>) (%[[VAL_3]], %[[VAL_1]] : !fir.ref}>>, !fir.class>) {pass_arg_pos = 1 : i32} -// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.ref}>>) -> !fir.ref}>> -// CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]] : !fir.ref}>> -// CHECK: llvm.intr.stackrestore %[[VAL_2]] : !llvm.ptr -// CHECK: fir.store %[[VAL_5]] to %[[VAL_0]] : !fir.ref}>> -// CHECK: return -// CHECK: } - - -// CHECK: func.func private @test_fp_80() -> f80 -// CHECK: func.func private @test_complex_80(!fir.ref}>> {llvm.align = 16 : i32, llvm.sret = !fir.type}>}) -// CHECK: func.func private @test_two_fp_80(!fir.ref> {llvm.align = 16 : i32, llvm.sret = !fir.type}) -// CHECK: func.func private @test_fp128() -> f128 From a3638f19bc04468c6db28a9cca50975229bfd45a Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Thu, 10 Oct 2024 17:31:53 +0200 Subject: [PATCH 031/177] [clang] Update string and string_view in lifetimebound tests (#111737) Removes pragmas like `# 1 "" 1 3` to make line numbers in failing tests more accurate. Use `basic_string_view` instead `string_view` to kick in GSL owner/pointer auto inference. 
--- clang/test/SemaCXX/attr-lifetimebound.cpp | 33 +++++++++++------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/clang/test/SemaCXX/attr-lifetimebound.cpp b/clang/test/SemaCXX/attr-lifetimebound.cpp index 0fb997a5671085..bdc58171917375 100644 --- a/clang/test/SemaCXX/attr-lifetimebound.cpp +++ b/clang/test/SemaCXX/attr-lifetimebound.cpp @@ -75,23 +75,26 @@ namespace usage_ok { } } -# 1 "" 1 3 namespace std { using size_t = __SIZE_TYPE__; - struct string { - string(); - string(const char*); + template + struct basic_string { + basic_string(); + basic_string(const T*); char &operator[](size_t) const [[clang::lifetimebound]]; }; - string operator""s(const char *, size_t); - - struct string_view { - string_view(); - string_view(const char *p [[clang::lifetimebound]]); - string_view(const string &s [[clang::lifetimebound]]); + using string = basic_string; + string operator""s(const char *, size_t); // expected-warning {{user-defined literal suffixes not starting with '_' are reserved}} + + template + struct basic_string_view { + basic_string_view(); + basic_string_view(const T *p); + basic_string_view(const string &s [[clang::lifetimebound]]); }; - string_view operator""sv(const char *, size_t); + using string_view = basic_string_view; + string_view operator""sv(const char *, size_t); // expected-warning {{user-defined literal suffixes not starting with '_' are reserved}} struct vector { int *data(); @@ -100,7 +103,6 @@ namespace std { template struct map {}; } -# 68 "attr-lifetimebound.cpp" 2 using std::operator""s; using std::operator""sv; @@ -112,7 +114,7 @@ namespace p0936r0_examples { void f() { std::string_view sv = "hi"; std::string_view sv2 = sv + sv; // expected-warning {{temporary}} - sv2 = sv + sv; // FIXME: can we infer that we should warn here too? 
+ sv2 = sv + sv; // expected-warning {{object backing the pointer}} } struct X { int a, b; }; @@ -238,11 +240,6 @@ template T *addressof(T &arg) { &const_cast(reinterpret_cast(arg))); } -template -struct basic_string_view { - basic_string_view(const T *); -}; - template struct span { template span(const T (&__arr)[_ArrayExtent]) noexcept; From 9839b8cfb477866b8610714976cc6599f32f63e6 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 10 Oct 2024 19:32:02 +0400 Subject: [PATCH 032/177] llvm-reduce: Fix assert on invokes with catchswitch (#111838) This is the minimal change to avoid the assert. There's an API flaw in invoke instructions where getLandingPad assumes all invoke unwind blocks have landingpads, when some have catchswitch instead. Fixes #111817 --- .../issue111817-catchswitch-assert.ll | 53 +++++++++++++++++++ .../llvm-reduce/deltas/ReduceBasicBlocks.cpp | 13 ++++- 2 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 llvm/test/tools/llvm-reduce/issue111817-catchswitch-assert.ll diff --git a/llvm/test/tools/llvm-reduce/issue111817-catchswitch-assert.ll b/llvm/test/tools/llvm-reduce/issue111817-catchswitch-assert.ll new file mode 100644 index 00000000000000..cf20c8607ab2f3 --- /dev/null +++ b/llvm/test/tools/llvm-reduce/issue111817-catchswitch-assert.ll @@ -0,0 +1,53 @@ +; RUN: llvm-reduce -abort-on-invalid-reduction --delta-passes=basic-blocks --test FileCheck --test-arg --check-prefixes=CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck --check-prefix=CHECK-FINAL %s < %t + +; Make sure there's no assertion for invoke destinations that don't +; use landingpad (and use catchswitch instead) + +; CHECK-INTERESTINGNESS: invoke + +; CHECK-FINAL: bb: +; CHECK-FINAL-NEXT: invoke void @llvm.seh.try.begin() +; CHECK-FINAL-NEXT: to label %bb7 unwind label %bb1 +; CHECK-FINAL: bb1: +; CHECK-FINAL-NEXT: %i = catchswitch within none [label %bb2] unwind to caller + +; CHECK-FINAL: bb2: +; CHECK-FINAL-NEXT: %i3 = 
catchpad within %i [ptr null] +; CHECK-FINAL-NEXT: ret ptr null + +; CHECK-FINAL-NOT: bb4 +; CHECK-FINAL-NOT: bb5 + +; CHECK-FINAL: bb7: +; CHECK-FINAL-NEXT: ret ptr null +define ptr @func() personality ptr @__C_specific_handler { +bb: + invoke void @llvm.seh.try.begin() + to label %bb7 unwind label %bb1 + +bb1: ; preds = %bb + %i = catchswitch within none [label %bb2] unwind to caller + +bb2: ; preds = %bb1 + %i3 = catchpad within %i [ptr null] + catchret from %i3 to label %bb4 + +bb4: ; preds = %bb2 + invoke void @llvm.seh.try.end() + to label %bb7 unwind label %bb5 + +bb5: ; preds = %bb4 + %i6 = cleanuppad within none [] + cleanupret from %i6 unwind to caller + +bb7: ; preds = %bb4, %bb + ret ptr null +} + +declare void @llvm.seh.try.begin() #0 +declare void @llvm.seh.try.end() #0 +declare i32 @__C_specific_handler(...) + +attributes #0 = { nounwind willreturn memory(write) } + diff --git a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp index 6858dac9aeac41..41e3ffd963f5ba 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp @@ -45,12 +45,21 @@ static void replaceBranchTerminator(BasicBlock &BB, if (ChunkSuccessors.size() == Term->getNumSuccessors()) return; + // TODO: Handle these without failing verifier. + if (isa(Term)) + return; + bool IsBranch = isa(Term); if (InvokeInst *Invoke = dyn_cast(Term)) { - LandingPadInst *LP = Invoke->getLandingPadInst(); + BasicBlock *UnwindDest = Invoke->getUnwindDest(); + Instruction *LP = UnwindDest->getFirstNonPHI(); + // Remove landingpad instruction if the containing block isn't used by other // invokes. 
- if (none_of(LP->getParent()->users(), [Invoke](User *U) { + + // TODO: Handle catchswitch, catchpad, catchret, and cleanupret + if (isa(LP) && + none_of(UnwindDest->users(), [Invoke](User *U) { return U != Invoke && isa(U); })) { LP->replaceAllUsesWith(getDefaultValue(LP->getType())); From c042d8f7b35ccb7add9c873c7e5d74f568cca115 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 10 Oct 2024 08:43:30 -0700 Subject: [PATCH 033/177] [RISCV] Use RISCVAsmPrinter::EmitToStreamer for EmitHwasanMemaccessSymbols. (#111792) Add a MCSubtargetInfo& operand so we can control the subtarget for the new calls. The old signature is kept as a wrapper to pass *STI to maintain compatibility. By using EmitToStreamer we are able to compress the instructions when possible. --- llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 184 ++++++++++-------- .../CodeGen/RISCV/hwasan-check-memaccess.ll | 45 +++++ 2 files changed, 148 insertions(+), 81 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 384a7cf59f0632..5ad09ae7290fc5 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -86,7 +86,11 @@ class RISCVAsmPrinter : public AsmPrinter { const char *ExtraCode, raw_ostream &OS) override; // Returns whether Inst is compressed. 
- bool EmitToStreamer(MCStreamer &S, const MCInst &Inst); + bool EmitToStreamer(MCStreamer &S, const MCInst &Inst, + const MCSubtargetInfo &SubtargetInfo); + bool EmitToStreamer(MCStreamer &S, const MCInst &Inst) { + return EmitToStreamer(S, Inst, *STI); + } bool lowerPseudoInstExpansion(const MachineInstr *MI, MCInst &Inst); @@ -242,12 +246,13 @@ void RISCVAsmPrinter::LowerSTATEPOINT(MCStreamer &OutStreamer, StackMaps &SM, SM.recordStatepoint(*MILabel, MI); } -bool RISCVAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) { +bool RISCVAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst, + const MCSubtargetInfo &SubtargetInfo) { MCInst CInst; - bool Res = RISCVRVC::compress(CInst, Inst, *STI); + bool Res = RISCVRVC::compress(CInst, Inst, SubtargetInfo); if (Res) ++RISCVNumInstrsCompressed; - S.emitInstruction(Res ? CInst : Inst, *STI); + S.emitInstruction(Res ? CInst : Inst, SubtargetInfo); return Res; } @@ -662,87 +667,100 @@ void RISCVAsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { OutStreamer->emitLabel(Sym); // Extract shadow offset from ptr - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::SLLI).addReg(RISCV::X6).addReg(Reg).addImm(8), MCSTI); - OutStreamer->emitInstruction(MCInstBuilder(RISCV::SRLI) - .addReg(RISCV::X6) - .addReg(RISCV::X6) - .addImm(12), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::SRLI) + .addReg(RISCV::X6) + .addReg(RISCV::X6) + .addImm(12), + MCSTI); // load shadow tag in X6, X5 contains shadow base - OutStreamer->emitInstruction(MCInstBuilder(RISCV::ADD) - .addReg(RISCV::X6) - .addReg(RISCV::X5) - .addReg(RISCV::X6), - MCSTI); - OutStreamer->emitInstruction( + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::ADD) + .addReg(RISCV::X6) + .addReg(RISCV::X5) + .addReg(RISCV::X6), + MCSTI); + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::LBU).addReg(RISCV::X6).addReg(RISCV::X6).addImm(0), MCSTI); // Extract tag from pointer and compare it with loaded tag 
from shadow - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::SRLI).addReg(RISCV::X7).addReg(Reg).addImm(56), MCSTI); MCSymbol *HandleMismatchOrPartialSym = OutContext.createTempSymbol(); // X7 contains tag from the pointer, while X6 contains tag from memory - OutStreamer->emitInstruction( - MCInstBuilder(RISCV::BNE) - .addReg(RISCV::X7) - .addReg(RISCV::X6) - .addExpr(MCSymbolRefExpr::create(HandleMismatchOrPartialSym, - OutContext)), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::BNE) + .addReg(RISCV::X7) + .addReg(RISCV::X6) + .addExpr(MCSymbolRefExpr::create( + HandleMismatchOrPartialSym, OutContext)), + MCSTI); MCSymbol *ReturnSym = OutContext.createTempSymbol(); OutStreamer->emitLabel(ReturnSym); - OutStreamer->emitInstruction(MCInstBuilder(RISCV::JALR) - .addReg(RISCV::X0) - .addReg(RISCV::X1) - .addImm(0), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::JALR) + .addReg(RISCV::X0) + .addReg(RISCV::X1) + .addImm(0), + MCSTI); OutStreamer->emitLabel(HandleMismatchOrPartialSym); - OutStreamer->emitInstruction(MCInstBuilder(RISCV::ADDI) - .addReg(RISCV::X28) - .addReg(RISCV::X0) - .addImm(16), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::ADDI) + .addReg(RISCV::X28) + .addReg(RISCV::X0) + .addImm(16), + MCSTI); MCSymbol *HandleMismatchSym = OutContext.createTempSymbol(); - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::BGEU) .addReg(RISCV::X6) .addReg(RISCV::X28) .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)), MCSTI); - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::ANDI).addReg(RISCV::X28).addReg(Reg).addImm(0xF), MCSTI); if (Size != 1) - OutStreamer->emitInstruction(MCInstBuilder(RISCV::ADDI) - .addReg(RISCV::X28) - .addReg(RISCV::X28) - .addImm(Size - 1), - MCSTI); - OutStreamer->emitInstruction( + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::ADDI) + .addReg(RISCV::X28) + 
.addReg(RISCV::X28) + .addImm(Size - 1), + MCSTI); + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::BGE) .addReg(RISCV::X28) .addReg(RISCV::X6) .addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)), MCSTI); - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::ORI).addReg(RISCV::X6).addReg(Reg).addImm(0xF), MCSTI); - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::LBU).addReg(RISCV::X6).addReg(RISCV::X6).addImm(0), MCSTI); - OutStreamer->emitInstruction( - MCInstBuilder(RISCV::BEQ) - .addReg(RISCV::X6) - .addReg(RISCV::X7) - .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::BEQ) + .addReg(RISCV::X6) + .addReg(RISCV::X7) + .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)), + MCSTI); OutStreamer->emitLabel(HandleMismatchSym); @@ -781,50 +799,54 @@ void RISCVAsmPrinter::EmitHwasanMemaccessSymbols(Module &M) { // +---------------------------------+ <-- [x2 / SP] // Adjust sp - OutStreamer->emitInstruction(MCInstBuilder(RISCV::ADDI) - .addReg(RISCV::X2) - .addReg(RISCV::X2) - .addImm(-256), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::ADDI) + .addReg(RISCV::X2) + .addReg(RISCV::X2) + .addImm(-256), + MCSTI); // store x10(arg0) by new sp - OutStreamer->emitInstruction(MCInstBuilder(RISCV::SD) - .addReg(RISCV::X10) - .addReg(RISCV::X2) - .addImm(8 * 10), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::SD) + .addReg(RISCV::X10) + .addReg(RISCV::X2) + .addImm(8 * 10), + MCSTI); // store x11(arg1) by new sp - OutStreamer->emitInstruction(MCInstBuilder(RISCV::SD) - .addReg(RISCV::X11) - .addReg(RISCV::X2) - .addImm(8 * 11), - MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::SD) + .addReg(RISCV::X11) + .addReg(RISCV::X2) + .addImm(8 * 11), + MCSTI); // store x8(fp) by new sp - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, 
MCInstBuilder(RISCV::SD).addReg(RISCV::X8).addReg(RISCV::X2).addImm(8 * 8), MCSTI); // store x1(ra) by new sp - OutStreamer->emitInstruction( + EmitToStreamer( + *OutStreamer, MCInstBuilder(RISCV::SD).addReg(RISCV::X1).addReg(RISCV::X2).addImm(1 * 8), MCSTI); if (Reg != RISCV::X10) - OutStreamer->emitInstruction(MCInstBuilder(RISCV::ADDI) - .addReg(RISCV::X10) - .addReg(Reg) - .addImm(0), - MCSTI); - OutStreamer->emitInstruction( - MCInstBuilder(RISCV::ADDI) - .addReg(RISCV::X11) - .addReg(RISCV::X0) - .addImm(AccessInfo & HWASanAccessInfo::RuntimeMask), - MCSTI); - - OutStreamer->emitInstruction(MCInstBuilder(RISCV::PseudoCALL).addExpr(Expr), - MCSTI); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(RISCV::ADDI).addReg(RISCV::X10).addReg(Reg).addImm(0), + MCSTI); + EmitToStreamer(*OutStreamer, + MCInstBuilder(RISCV::ADDI) + .addReg(RISCV::X11) + .addReg(RISCV::X0) + .addImm(AccessInfo & HWASanAccessInfo::RuntimeMask), + MCSTI); + + EmitToStreamer(*OutStreamer, MCInstBuilder(RISCV::PseudoCALL).addExpr(Expr), + MCSTI); } } diff --git a/llvm/test/CodeGen/RISCV/hwasan-check-memaccess.ll b/llvm/test/CodeGen/RISCV/hwasan-check-memaccess.ll index 12c95206d21bed..dfd526c8964137 100644 --- a/llvm/test/CodeGen/RISCV/hwasan-check-memaccess.ll +++ b/llvm/test/CodeGen/RISCV/hwasan-check-memaccess.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv64 < %s | FileCheck %s ; RUN: llc -mtriple=riscv64 --relocation-model=pic < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+c --riscv-no-aliases < %s \ +; RUN: | FileCheck %s --check-prefix=COMPRESS define ptr @f2(ptr %x0, ptr %x1) { ; CHECK-LABEL: f2: @@ -14,6 +16,18 @@ define ptr @f2(ptr %x0, ptr %x1) { ; CHECK-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret +; +; COMPRESS-LABEL: f2: +; COMPRESS: # %bb.0: +; COMPRESS-NEXT: c.addi sp, -16 +; COMPRESS-NEXT: .cfi_def_cfa_offset 16 +; COMPRESS-NEXT: c.sdsp ra, 8(sp) 
# 8-byte Folded Spill +; COMPRESS-NEXT: .cfi_offset ra, -8 +; COMPRESS-NEXT: c.mv t0, a1 +; COMPRESS-NEXT: call __hwasan_check_x10_2_short +; COMPRESS-NEXT: c.ldsp ra, 8(sp) # 8-byte Folded Reload +; COMPRESS-NEXT: c.addi sp, 16 +; COMPRESS-NEXT: c.jr ra call void @llvm.hwasan.check.memaccess.shortgranules(ptr %x1, ptr %x0, i32 2) ret ptr %x0 } @@ -50,3 +64,34 @@ declare void @llvm.hwasan.check.memaccess.shortgranules(ptr, ptr, i32) ; CHECK-NEXT: sd ra, 8(sp) ; CHECK-NEXT: li a1, 2 ; CHECK-NEXT: call __hwasan_tag_mismatch_v2 + +; COMPRESS: .section .text.hot,"axG",@progbits,__hwasan_check_x10_2_short,comdat +; COMPRESS-NEXT: .type __hwasan_check_x10_2_short,@function +; COMPRESS-NEXT: .weak __hwasan_check_x10_2_short +; COMPRESS-NEXT: .hidden __hwasan_check_x10_2_short +; COMPRESS-NEXT: __hwasan_check_x10_2_short: +; COMPRESS-NEXT: slli t1, a0, 8 +; COMPRESS-NEXT: srli t1, t1, 12 +; COMPRESS-NEXT: c.add t1, t0 +; COMPRESS-NEXT: lbu t1, 0(t1) +; COMPRESS-NEXT: srli t2, a0, 56 +; COMPRESS-NEXT: bne t2, t1, .Ltmp0 +; COMPRESS-NEXT: .Ltmp1: +; COMPRESS-NEXT: c.jr ra +; COMPRESS-NEXT: .Ltmp0: +; COMPRESS-NEXT: c.li t3, 16 +; COMPRESS-NEXT: bgeu t1, t3, .Ltmp2 +; COMPRESS-NEXT: andi t3, a0, 15 +; COMPRESS-NEXT: c.addi t3, 3 +; COMPRESS-NEXT: bge t3, t1, .Ltmp2 +; COMPRESS-NEXT: ori t1, a0, 15 +; COMPRESS-NEXT: lbu t1, 0(t1) +; COMPRESS-NEXT: beq t1, t2, .Ltmp1 +; COMPRESS-NEXT: .Ltmp2: +; COMPRESS-NEXT: c.addi16sp sp, -256 +; COMPRESS-NEXT: c.sdsp a0, 80(sp) +; COMPRESS-NEXT: c.sdsp a1, 88(sp) +; COMPRESS-NEXT: c.sdsp s0, 64(sp) +; COMPRESS-NEXT: c.sdsp ra, 8(sp) +; COMPRESS-NEXT: c.li a1, 2 +; COMPRESS-NEXT: call __hwasan_tag_mismatch_v2 From c77b10746160f985625603b1e9c837b44caa5c67 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 10 Oct 2024 08:47:30 -0700 Subject: [PATCH 034/177] [lldb] Introduce an always-on system log category/channel (#108495) Add an "always on" log category and channel. Unlike other, existing log channels, it is not exposed to users. 
The channel is meant to be used sparsely and deliberately for logging high-value information to the system log. We have a similar concept in the downstream Swift fork and this has proven to be extremely valuable. This is especially true on macOS where system log messages are automatically captured as part of a sysdiagnose. --- lldb/include/lldb/Host/Host.h | 19 +++++++++++ lldb/include/lldb/Utility/Log.h | 11 +++--- lldb/source/API/SystemInitializerFull.cpp | 3 ++ lldb/source/Host/common/Host.cpp | 16 +++++++++ lldb/source/Host/common/HostInfoBase.cpp | 2 ++ lldb/source/Utility/Log.cpp | 34 ++++++++++++------- lldb/test/Shell/Host/TestSytemLogChannel.test | 3 ++ 7 files changed, 70 insertions(+), 18 deletions(-) create mode 100644 lldb/test/Shell/Host/TestSytemLogChannel.test diff --git a/lldb/include/lldb/Host/Host.h b/lldb/include/lldb/Host/Host.h index 9d0994978402f7..d8113a5fceeada 100644 --- a/lldb/include/lldb/Host/Host.h +++ b/lldb/include/lldb/Host/Host.h @@ -31,6 +31,25 @@ class ProcessInstanceInfo; class ProcessInstanceInfoMatch; typedef std::vector ProcessInstanceInfoList; +// System log category and channel. This log channel is always enabled and +// therefore is supposed to be used sparsely. Use this log channel to log +// critical information that is expected to be relevant to the majority of bug +// reports. 
+enum class SystemLog : Log::MaskType { + System = Log::ChannelFlag<0>, + LLVM_MARK_AS_BITMASK_ENUM(System) +}; + +LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); + +class LogChannelSystem { +public: + static void Initialize(); + static void Terminate(); +}; + +template <> Log::Channel &LogChannelFor(); + // Exit Type for inferior processes struct WaitStatus { enum Type : uint8_t { diff --git a/lldb/include/lldb/Utility/Log.h b/lldb/include/lldb/Utility/Log.h index 27707c17f9b824..ac6347153a1014 100644 --- a/lldb/include/lldb/Utility/Log.h +++ b/lldb/include/lldb/Utility/Log.h @@ -272,6 +272,12 @@ class Log final { void VAFormatf(llvm::StringRef file, llvm::StringRef function, const char *format, va_list args); + void Enable(const std::shared_ptr &handler_sp, + std::optional flags = std::nullopt, + uint32_t options = 0); + + void Disable(std::optional flags = std::nullopt); + private: Channel &m_channel; @@ -297,11 +303,6 @@ class Log final { return m_handler; } - void Enable(const std::shared_ptr &handler_sp, uint32_t options, - MaskType flags); - - void Disable(MaskType flags); - bool Dump(llvm::raw_ostream &stream); typedef llvm::StringMap ChannelMap; diff --git a/lldb/source/API/SystemInitializerFull.cpp b/lldb/source/API/SystemInitializerFull.cpp index 995d14f7c1fa1e..8a992a6889a91b 100644 --- a/lldb/source/API/SystemInitializerFull.cpp +++ b/lldb/source/API/SystemInitializerFull.cpp @@ -17,6 +17,7 @@ #include "lldb/Interpreter/CommandInterpreter.h" #include "lldb/Target/ProcessTrace.h" #include "lldb/Utility/Timer.h" +#include "lldb/Version/Version.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetSelect.h" @@ -83,6 +84,8 @@ llvm::Error SystemInitializerFull::Initialize() { // Use the Debugger's LLDBAssert callback. 
SetLLDBAssertCallback(Debugger::AssertCallback); + LLDB_LOG(GetLog(SystemLog::System), "{0}", GetVersion()); + return llvm::Error::success(); } diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index f08adea6546ae1..abca6068d3604a 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -125,6 +125,22 @@ void Host::SystemLog(Severity severity, llvm::StringRef message) { #endif #endif +static constexpr Log::Category g_categories[] = { + {{"system"}, {"system log"}, SystemLog::System}}; + +static Log::Channel g_system_channel(g_categories, SystemLog::System); +static Log g_system_log(g_system_channel); + +template <> Log::Channel &lldb_private::LogChannelFor() { + return g_system_channel; +} + +void LogChannelSystem::Initialize() { + g_system_log.Enable(std::make_shared()); +} + +void LogChannelSystem::Terminate() { g_system_log.Disable(); } + #if !defined(__APPLE__) && !defined(_WIN32) static thread_result_t MonitorChildProcessThreadFunction(::pid_t pid, diff --git a/lldb/source/Host/common/HostInfoBase.cpp b/lldb/source/Host/common/HostInfoBase.cpp index 5c44c2f38b2879..89dfe4a9e9baa3 100644 --- a/lldb/source/Host/common/HostInfoBase.cpp +++ b/lldb/source/Host/common/HostInfoBase.cpp @@ -76,9 +76,11 @@ static HostInfoBase::SharedLibraryDirectoryHelper *g_shlib_dir_helper = nullptr; void HostInfoBase::Initialize(SharedLibraryDirectoryHelper *helper) { g_shlib_dir_helper = helper; g_fields = new HostInfoBaseFields(); + LogChannelSystem::Initialize(); } void HostInfoBase::Terminate() { + LogChannelSystem::Terminate(); g_shlib_dir_helper = nullptr; delete g_fields; g_fields = nullptr; diff --git a/lldb/source/Utility/Log.cpp b/lldb/source/Utility/Log.cpp index f6b1381f63ad1c..3798f406476370 100644 --- a/lldb/source/Utility/Log.cpp +++ b/lldb/source/Utility/Log.cpp @@ -93,22 +93,28 @@ Log::MaskType Log::GetFlags(llvm::raw_ostream &stream, } void Log::Enable(const std::shared_ptr &handler_sp, - uint32_t 
options, Log::MaskType flags) { + std::optional flags, uint32_t options) { llvm::sys::ScopedWriter lock(m_mutex); - MaskType mask = m_mask.fetch_or(flags, std::memory_order_relaxed); - if (mask | flags) { + if (!flags) + flags = m_channel.default_flags; + + MaskType mask = m_mask.fetch_or(*flags, std::memory_order_relaxed); + if (mask | *flags) { m_options.store(options, std::memory_order_relaxed); m_handler = handler_sp; m_channel.log_ptr.store(this, std::memory_order_relaxed); } } -void Log::Disable(Log::MaskType flags) { +void Log::Disable(std::optional flags) { llvm::sys::ScopedWriter lock(m_mutex); - MaskType mask = m_mask.fetch_and(~flags, std::memory_order_relaxed); - if (!(mask & ~flags)) { + if (!flags) + flags = std::numeric_limits::max(); + + MaskType mask = m_mask.fetch_and(~(*flags), std::memory_order_relaxed); + if (!(mask & ~(*flags))) { m_handler.reset(); m_channel.log_ptr.store(nullptr, std::memory_order_relaxed); } @@ -230,10 +236,11 @@ bool Log::EnableLogChannel(const std::shared_ptr &log_handler_sp, error_stream << llvm::formatv("Invalid log channel '{0}'.\n", channel); return false; } - MaskType flags = categories.empty() - ? iter->second.m_channel.default_flags - : GetFlags(error_stream, *iter, categories); - iter->second.Enable(log_handler_sp, log_options, flags); + + auto flags = categories.empty() ? std::optional{} + : GetFlags(error_stream, *iter, categories); + + iter->second.Enable(log_handler_sp, flags, log_options); return true; } @@ -245,9 +252,10 @@ bool Log::DisableLogChannel(llvm::StringRef channel, error_stream << llvm::formatv("Invalid log channel '{0}'.\n", channel); return false; } - MaskType flags = categories.empty() - ? std::numeric_limits::max() - : GetFlags(error_stream, *iter, categories); + + auto flags = categories.empty() ? 
std::optional{} + : GetFlags(error_stream, *iter, categories); + iter->second.Disable(flags); return true; } diff --git a/lldb/test/Shell/Host/TestSytemLogChannel.test b/lldb/test/Shell/Host/TestSytemLogChannel.test new file mode 100644 index 00000000000000..4de699f0e09a4a --- /dev/null +++ b/lldb/test/Shell/Host/TestSytemLogChannel.test @@ -0,0 +1,3 @@ +RUN: %lldb -o 'log list' -o 'log disable system' 2>&1 | FileCheck %s +CHECK-NOT: Logging categories for 'system' +CHECK: Invalid log channel 'system' From f5aec03f6dd2f92590ecec9e3419b38b11d8476e Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 10 Oct 2024 15:50:35 +0000 Subject: [PATCH 035/177] [clang][analyzer][NFC] Fix strange bracket placement --- clang/lib/Analysis/ProgramPoint.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Analysis/ProgramPoint.cpp b/clang/lib/Analysis/ProgramPoint.cpp index 2a91749affd2a6..768345c8425f02 100644 --- a/clang/lib/Analysis/ProgramPoint.cpp +++ b/clang/lib/Analysis/ProgramPoint.cpp @@ -157,7 +157,7 @@ void ProgramPoint::printJson(llvm::raw_ostream &Out, const char *NL) const { LHS->printJson(Out, nullptr, PP, AddQuotes); } else { Out << "null"; - } + } Out << ", \"rhs\": "; if (const Stmt *RHS = C->getRHS()) { From 23309d7d9553af69b2912a159bc2e488acf69255 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 10 Oct 2024 16:53:36 +0100 Subject: [PATCH 036/177] [Dexter] Set up ComInterface module to be imported correctly (#111850) Fixes issue added by: https://github.com/llvm/llvm-project/pull/111833 Following the previous commit that changed how Dexter imports modules, the ComInterface module import became broken. This is because it had a different directory structure to other modules, where we want to import single file rather than a dir containing a __init__.py. For this case, an optional extra arg has been added to load_module allowing a filename to be specified, letting us import ComInterface.py directly and fixing the issue. 
--- .../dex/debugger/visualstudio/VisualStudio.py | 4 +++- .../debuginfo-tests/dexter/dex/utils/Imports.py | 13 +++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py index 7cb56ec0c25a76..a6752274efac20 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/visualstudio/VisualStudio.py @@ -24,7 +24,9 @@ def _load_com_module(): try: return load_module( - "ComInterface", os.path.join(os.path.dirname(__file__), "windows") + "ComInterface", + os.path.join(os.path.dirname(__file__), "windows"), + "ComInterface.py", ) except ImportError as e: raise LoadDebuggerException(e, sys.exc_info()) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py b/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py index ea052c21a18498..cd184f9d20ed8f 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/utils/Imports.py @@ -1,12 +1,17 @@ -import importlib +import importlib.util import os import sys -def load_module(name, path): - spec = importlib.util.spec_from_file_location( - name, os.path.join(path, name, "__init__.py") +def load_module(name, path, mod_file="__init__.py"): + # The module is either defined by a directory, in which case we search for + # `path/name/__init__.py`, or it is a single file at `path/mod_file`. 
+ mod_path = ( + os.path.join(path, name, mod_file) + if mod_file == "__init__.py" + else os.path.join(path, mod_file) ) + spec = importlib.util.spec_from_file_location(name, mod_path) module = importlib.util.module_from_spec(spec) sys.modules[name] = module spec.loader.exec_module(module) From 03483737a7a2d72a257a5ab6ff01748ad9cf0f75 Mon Sep 17 00:00:00 2001 From: Md Asghar Ahmad Shahid Date: Thu, 10 Oct 2024 21:30:58 +0530 Subject: [PATCH 037/177] [mlir][linalg] Introduce transpose semantic to 'linalg.matmul' ops. (#104783) The main goal of this patch is to extend the semantic of 'linalg.matmul' named op to include per operand transpose semantic while also laying out a way to move ops definition from OpDSL to tablegen. Hence, it is implemented in tablegen. Transpose semantic is as follows. By default 'linalg.matmul' behavior will remain as is. Transpose semantics can be appiled on per input operand by specifying the optional permutation attributes (namely 'permutationA' for 1st input and 'permutationB' for 2nd input) for each operand explicitly as needed. By default, no transpose is mandated for any of the input operand. 
Example: ``` %val = linalg.matmul ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) permutationA = [1, 0] permutationB = [0, 1] ``` --- .../Dialect/Linalg/IR/LinalgInterfaces.td | 10 + .../Linalg/IR/LinalgNamedStructuredOps.yaml | 72 ----- .../Dialect/Linalg/IR/LinalgStructuredOps.td | 134 +++++++++ .../Dialect/Linalg/IR/LinalgInterfaces.cpp | 17 +- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 263 +++++++++++++++++- .../Linalg/Transforms/TransposeMatmul.cpp | 7 + .../Linalg/Transforms/Vectorization.cpp | 5 + .../NVGPU/TransformOps/NVGPUTransformOps.cpp | 6 + .../linalg/opdsl/ops/core_named_ops.py | 17 -- .../Dialect/Linalg/generalize-named-ops.mlir | 111 ++++++++ mlir/test/Dialect/Linalg/invalid.mlir | 159 +++++++++++ mlir/test/Dialect/Linalg/named-ops.mlir | 243 ++++++++++++++++ mlir/test/python/dialects/linalg/ops.py | 75 ----- .../mlir-linalg-ods-yaml-gen.cpp | 6 +- 14 files changed, 943 insertions(+), 182 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index fbf3f19cde0e9b..e80dbb2afb9ef7 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -684,6 +684,16 @@ def LinalgStructuredInterface return; }] >, + InterfaceMethod< + /*desc=*/[{ + Return true if the user has supplied an explicit indexing maps for this op. + }], + /*retTy=*/"bool", + /*methodName=*/"hasUserDefinedMaps", + /*args=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ return false; }] + >, //===------------------------------------------------------------------===// // Linalg generalization hooks. 
//===------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml index 8cb698096ef5b7..97b90333e2b200 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -1065,78 +1065,6 @@ structured_op: !LinalgStructuredOpConfig - !ScalarExpression scalar_arg: rhs --- !LinalgOpConfig -metadata: !LinalgOpMetadata - name: matmul - cpp_class_name: MatmulOp - doc: |- - Performs a matrix multiplication of two 2D inputs. - - Numeric casting is performed on the operands to the inner multiply, promoting - them to the same data type as the accumulator/output. - implements: - - LinalgContractionOpInterface -structured_op: !LinalgStructuredOpConfig - args: - - !LinalgOperandDefConfig - name: A - kind: input_tensor - type_var: T1 - shape_map: affine_map<()[s0, s1, s2] -> (s0, s1)> - - !LinalgOperandDefConfig - name: B - kind: input_tensor - type_var: T2 - shape_map: affine_map<()[s0, s1, s2] -> (s1, s2)> - - !LinalgOperandDefConfig - name: C - kind: output_tensor - type_var: U - shape_map: affine_map<()[s0, s1, s2] -> (s0, s2)> - - !LinalgOperandDefConfig - name: cast - kind: type_fn_attr - default_fn: cast_signed - indexing_maps: !LinalgIndexingMapsConfig - static_indexing_maps: - - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d2)> - - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2, d1)> - - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d1)> - iterator_types: - - parallel - - parallel - - reduction - assignments: - - !ScalarAssign - arg: C - value: !ScalarExpression - scalar_fn: - kind: binary - fn_name: add - operands: - - !ScalarExpression - scalar_arg: C - - !ScalarExpression - scalar_fn: - kind: binary - fn_name: mul - operands: - - !ScalarExpression - scalar_fn: - kind: type - attr_name: cast - type_var: U - operands: - - 
!ScalarExpression - scalar_arg: A - - !ScalarExpression - scalar_fn: - kind: type - attr_name: cast - type_var: U - operands: - - !ScalarExpression - scalar_arg: B ---- !LinalgOpConfig metadata: !LinalgOpMetadata name: quantized_matmul cpp_class_name: QuantizedMatmulOp diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 31f29139247267..61d4fc9734c6de 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -535,6 +535,140 @@ def BroadcastOp : LinalgStructuredBase_Op<"broadcast", [ let hasCanonicalizer = 1; } +//===----------------------------------------------------------------------===// +// Op definition for MatmulOp +//===----------------------------------------------------------------------===// + +def MatmulOp : LinalgStructuredBase_Op<"matmul", [ + AttrSizedOperandSegments, + LinalgContractionOpInterface]> { + + let summary = [{ + Performs a matrix multiplication of two 2D inputs without broadcast or transpose. + }]; + let description = [{ + Numeric casting is performed on the operands to the inner multiply, + promoting them to the same data type as the accumulator/output. + + Broadcast and Transpose semantics can be appiled by specifying the explicit attribute + 'indexing_maps' as shown below.This is a list attribute, so the list must include all + the maps if specified. 
+ + Example Transpose: + ``` + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, // transpose + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>,memref<5x7xf32>) + outs(%arg2: memref<3x7xf32>) + ``` + + Example Broadcast: + ``` + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, // broadcast + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3xf32>, memref<5x7xf32>) + outs(%arg2: memref<3x7xf32>) + ``` + + Example Broadcast and transpose: + ``` + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, // transpose + affine_map<(d0, d1, d2) -> (d2)>, // broadcast + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) + }]; + + let arguments = (ins + Variadic:$inputs, + Variadic:$outputs, + DefaultValuedOptionalAttr:$indexing_maps, + DefaultValuedOptionalAttr:$cast + ); + let results = (outs Variadic:$result_tensors); + let regions = (region AnyRegion:$region); + + let skipDefaultBuilders = 1; + let builders = [ + OpBuilder< + (ins "ValueRange":$inputs, "ValueRange":$outputs, + CArg<"ArrayRef", "{}">:$attributes), + [{ + buildStructuredOp($_builder, $_state, std::nullopt, inputs, outputs, + attributes, MatmulOp::getRegionBuilder()); + }]>, + OpBuilder< + (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs, + "ValueRange":$outputs, + CArg<"ArrayRef", "{}">:$attributes), + [{ + buildStructuredOp($_builder, $_state, resultTensorTypes, + inputs, outputs, attributes, MatmulOp::getRegionBuilder()); + }]>, + OpBuilder< + (ins "TypeRange":$resultTensorTypes, "ValueRange":$operands, + CArg<"ArrayRef", "{}">:$attributes), + [{ + $_state.addOperands(operands); + $_state.addAttributes(attributes); + $_state.addTypes(resultTensorTypes); + (void)$_state.addRegion(); + }]>, + OpBuilder< + (ins "TypeRange":$resultTensorTypes, 
"ValueRange":$inputs, + "ValueRange":$outputs, + "Attribute":$cast, CArg<"ArrayRef", "{}">:$attributes), + [{ + $_state.addAttribute("cast", cast); + buildStructuredOp($_builder, $_state, resultTensorTypes, inputs, outputs, + attributes, MatmulOp::getRegionBuilder()); + }]> + + ]; + let hasCustomAssemblyFormat = 1; + let hasFolder = 1; + let hasVerifier = 1; + + let extraClassDeclaration = structuredOpsBaseDecls # [{ + SmallVector getIteratorTypesArray(); + + /// Implements the block region builder. + static void regionBuilder(ImplicitLocOpBuilder &b, + Block &block, ArrayRef attrs); + + /// Returns a list of AffineMap with the typical matmul indexing charactristic. + SmallVector getDefaultIndexingMaps(); + + /// Returns true if the given broadcast map \p bcastMap is valid for this op. + bool isValidLhsRhsBroadcastMap(AffineMap bcastMap); + + static std::function)> + getRegionBuilder() { + return regionBuilder; + } + + ::mlir::MutableOperandRange getDpsInitsMutable() { + return getOutputsMutable(); + } + + // Generic methods. + static unsigned getNumRegionArgs(); + std::string getLibraryCallName(); + bool hasDynamicIndexingMaps(); + /// Check if the op has broadcast and/or transpose semantic. Returns true if the + /// user defined indexing maps are not equal to default map. + bool hasUserDefinedMaps(); + }]; +} + //===----------------------------------------------------------------------===// // Named Linalg ops, implemented as a declarative configurations of generic ops. 
//===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp index 40795879c3026d..3b9194098fa783 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp @@ -15,13 +15,20 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/AffineMap.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "mlir/IR/MLIRContext.h" #include "mlir/IR/TypeUtilities.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" #include +#include using namespace mlir; using namespace mlir::linalg; @@ -1142,7 +1149,6 @@ int64_t LinalgOp::getIndexingMapIndex(OpOperand *opOperand) { LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { LinalgOp linalgOp = cast(op); - // Mixed tensor/buffer operands are not allowed. if (!linalgOp.hasPureTensorSemantics() && !linalgOp.hasPureBufferSemantics() && op->getNumOperands() > 0) @@ -1162,6 +1168,8 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { << ") to be equal to the number of input/output operands (" << linalgOp->getNumOperands() << ")"; + // Set this flag if this op has user defined maps. This is required to guard + // the below error condition which assume default indexing maps. 
for (OpOperand &opOperand : linalgOp->getOpOperands()) { AffineMap indexingMap = linalgOp.getMatchingIndexingMap(&opOperand); @@ -1178,13 +1186,13 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { << " dim(s) to match the number of loops"; int64_t rank = linalgOp.getRank(&opOperand); + if (indexingMap.getNumResults() != rank) return op->emitOpError("expected operand rank (") << rank << ") to match the result rank of indexing_map #" << opOperand.getOperandNumber() << " (" << indexingMap.getNumResults() << ")"; } - SmallVector redDims; linalgOp.getReductionDims(redDims); @@ -1194,9 +1202,8 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { // Check if given shapes match to inferred shapes. SmallVector endLoopRangeValues = linalgOp.getStaticLoopRanges(); SmallVector startLoopRangeValues(endLoopRangeValues.size(), 0); - - // Verify only static cases since we can't get exact dimension sizes and loop - // ranges for dynamic cases in this stage. + // Verify only static cases since we can't get exact dimension sizes and + // loop ranges for dynamic cases in this stage. 
if (llvm::none_of(endLoopRangeValues, ShapedType::isDynamic)) { for (int64_t &range : endLoopRangeValues) range -= 1; diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 730c478c2883ef..4f350ea236da84 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -27,6 +27,7 @@ #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/AffineMap.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/Matchers.h" @@ -37,12 +38,17 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/LogicalResult.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include #include using namespace mlir; @@ -149,15 +155,36 @@ static void fillStructuredOpRegion(OpBuilder &opBuilder, Region ®ion, // iterator_types is an auto-generated method. } +/// Helper to create a typical indexing map for MatmulOp. Returns a list of +/// AffineMap. +static SmallVector +getDefaultIndexingMapsForMatmul(MLIRContext *context) { + AffineExpr d0, d1, d2; + SmallVector indexingMaps; + bindDims(context, d0, d1, d2); + indexingMaps.push_back(AffineMap::get(3, 0, {d0, d2}, context)); + indexingMaps.push_back(AffineMap::get(3, 0, {d2, d1}, context)); + indexingMaps.push_back(AffineMap::get(3, 0, {d0, d1}, context)); + return indexingMaps; +} + +/// Wrapper to return the typical indexing map array attribute for MatmulOp. 
+static SmallVector getDefaultIndexingMapAttr(MLIRContext *context) { + return llvm::map_to_vector( + getDefaultIndexingMapsForMatmul(context), + [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); }); +} + /// Creates a structured operation given `inputs`, `outputs`, and `attributes`. /// The result types are derived automatically if `resultTensorTypes` is none. /// The body of the operation is filled using `regionBuilder`. All ods-gen /// created structured operations use the method to implement their builders. -static void buildStructuredOp(OpBuilder &b, OperationState &state, - std::optional resultTensorTypes, - ValueRange inputs, ValueRange outputs, - ArrayRef attributes, - RegionBuilderFn regionBuilder) { +static void buildStructuredOp( + OpBuilder &b, OperationState &state, + std::optional resultTensorTypes, ValueRange inputs, + ValueRange outputs, ArrayRef attributes, + RegionBuilderFn regionBuilder, + std::optional> indexingMaps = std::nullopt) { // Derive the result types if needed. SmallVector derivedResultTypes = resultTensorTypes.value_or(TypeRange()); @@ -168,6 +195,20 @@ static void buildStructuredOp(OpBuilder &b, OperationState &state, state.addOperands(inputs); state.addOperands(outputs); state.addTypes(derivedResultTypes); + + // Initialize indexingMaps, for MatmulOp. 
+ SmallVector indexingMapsAttrVal; + if (indexingMaps.has_value()) { + for (mlir::AffineMap map : *indexingMaps) { + // Convert each AffineMap to an AffineMapAttr + indexingMapsAttrVal.push_back(AffineMapAttr::get(map)); + } + state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal)); + } else { + indexingMapsAttrVal = getDefaultIndexingMapAttr(b.getContext()); + state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal)); + } + state.addAttributes(attributes); state.addAttribute( "operandSegmentSizes", @@ -299,11 +340,48 @@ static ParseResult parseNamedStructuredOp(OpAsmParser &parser, OperationState &result, unsigned numRegionArgs, RegionBuilderFn regionBuilder) { + + SmallVector indexingMapsAttr; + Attribute mapAttr; + if (succeeded(parser.parseOptionalKeyword("indexing_maps"))) { + if (parser.parseEqual()) + return failure(); + + if (parser.parseLSquare()) + return failure(); + + do { + if (parser.parseAttribute(mapAttr)) + return failure(); + if (!isa(mapAttr)) { + return parser.emitError(parser.getCurrentLocation(), + "expected affine map attribute"); + } + indexingMapsAttr.push_back(mapAttr); + + if (parser.parseOptionalComma()) + break; + } while (true); + + if (parser.parseRSquare()) + return failure(); + } + // Initialize indexingMaps, if not supplied explicitly. + if (indexingMapsAttr.empty()) { + indexingMapsAttr = getDefaultIndexingMapAttr(result.getContext()); + } + result.addAttribute("indexing_maps", + parser.getBuilder().getArrayAttr(indexingMapsAttr)); + // TODO: Enable when ods-gen supports captures. SmallVector inputTypes, outputTypes; if (parseCommonStructuredOpParts(parser, result, inputTypes, outputTypes)) return failure(); + // Parse optional attributes. + if (parser.parseOptionalAttrDict(result.attributes)) + return failure(); + // TODO: consider merging results parsing into region parsing. // Need to wait for declarative assembly resolution to decide. 
SmallVector outputTensorsTypes; @@ -329,13 +407,9 @@ static void printNamedStructuredOpResults(OpAsmPrinter &p, } static void printNamedStructuredOp(OpAsmPrinter &p, Operation *op, - ValueRange inputs, ValueRange outputs) { - p.printOptionalAttrDict( - op->getAttrs(), - /*elidedAttrs=*/{"operandSegmentSizes", - // See generated code in - // LinalgNamedStructuredOps.yamlgen.cpp.inc - "linalg.memoized_indexing_maps"}); + ValueRange inputs, ValueRange outputs, + ArrayRef elidedAttrs = {}) { + p.printOptionalAttrDict(op->getAttrs(), elidedAttrs); // Printing is shared with generic ops, except for the region and // attributes. @@ -3382,3 +3456,168 @@ Operation *LinalgDialect::materializeConstant(OpBuilder &builder, Location loc) { return arith::ConstantOp::materialize(builder, value, type, loc); } + +/// Returns true if the result AffineExpr of the \p explicitMap is same as \p +/// defaultMap. +static bool isValidResultDimExprs(AffineMap explictMap, AffineMap defaultMap) { + auto explicitRange = explictMap.getResults(); + auto defaultRange = defaultMap.getResults(); + DenseSet explicitSet(explicitRange.begin(), explicitRange.end()); + DenseSet defaultSet(defaultRange.begin(), defaultRange.end()); + llvm::set_union(explicitSet, defaultSet); + return explicitSet == defaultSet; +} + +/// Returns true if the \p explictMap is broadcasted with respect to the +/// \p defaultMap. +static bool isBroadcasted(AffineMap explictMap, AffineMap defaultMap) { + return explictMap.getNumResults() < defaultMap.getNumResults(); +} + +/// Verifies the broadcast and transpose semantic sepecified by the explicit +/// indexing map for the MatmulOp \p op for each operand specified by \p +/// opIndex. 
+static LogicalResult verifyExtendedMatmulSemantic(MatmulOp matmulOp, + unsigned opIndex) { + SmallVector opIndexingMaps = matmulOp.getIndexingMapsArray(); + SmallVector defaultIndexingMaps = + matmulOp.getDefaultIndexingMaps(); + + auto opIndexingMap = opIndexingMaps[opIndex]; + auto defaultIndexingMap = defaultIndexingMaps[opIndex]; + // Check general validity of indexing map results. + if (!isValidResultDimExprs(opIndexingMap, defaultIndexingMap)) + return matmulOp->emitOpError() + << "Unexpected dim expression in map result."; + + // Check if the requested broadcast is valid. + if (isBroadcasted(opIndexingMap, defaultIndexingMap)) { + if (!matmulOp.isValidLhsRhsBroadcastMap(opIndexingMap)) { + return matmulOp->emitOpError() + << "Invalid broadcast requested, should be (d2)."; + } + return success(); + } + return success(); +} + +namespace mlir { +namespace linalg { +//===----------------------------------------------------------------------===// +// MatMulOp +//===----------------------------------------------------------------------===// +SmallVector MatmulOp::getIteratorTypesArray() { + return SmallVector{utils::IteratorType::parallel, + utils::IteratorType::parallel, + utils::IteratorType::reduction}; +} + +unsigned MatmulOp::getNumRegionArgs() { return 3; } + +std::string MatmulOp::getLibraryCallName() { + return generateLibraryCallName(getOperation()); +} + +bool MatmulOp::hasDynamicIndexingMaps() { return true; } + +/// Check if the op has broadcast and/or transpose semantic. Returns true if the +/// user defined indexing maps are not equal to default map. +bool MatmulOp::hasUserDefinedMaps() { + SmallVector defaultMaps = getDefaultIndexingMaps(); + SmallVector explicitMaps = getIndexingMapsArray(); + return defaultMaps != explicitMaps; +} + +/// Implements the block region builder for the MatmulOp. This is called by +/// 'fillStructuredOpRegion'. 
+void MatmulOp::regionBuilder(ImplicitLocOpBuilder &b, Block &block, + ArrayRef attrs) { + assert(3 > 0 && block.getNumArguments() == 3 && + "MatmulOp regionBuilder expects 3 (>=0) args"); + RegionBuilderHelper helper(b, block); + SmallVector yields; + + TypeFn castVal = TypeFn::cast_signed; + auto castIter = llvm::find_if(attrs, [&](const NamedAttribute &attr) { + return attr.getName() == "cast"; + }); + if (castIter != attrs.end()) { + if (auto attr = llvm::dyn_cast(castIter->getValue())) + castVal = attr.getValue(); + } + + Value value1 = helper.buildTypeFn(castVal, block.getArgument(2).getType(), + block.getArgument(0)); + Value value2 = helper.buildTypeFn(castVal, block.getArgument(2).getType(), + block.getArgument(1)); + Value value3 = helper.buildBinaryFn(BinaryFn::mul, value1, value2); + Value value4 = + helper.buildBinaryFn(BinaryFn::add, block.getArgument(2), value3); + yields.push_back(value4); + helper.yieldOutputs(yields); +} + +/// Returns a list of AffineMap with the typical matmul indexing charactristic. +SmallVector MatmulOp::getDefaultIndexingMaps() { + MLIRContext *context = this->getContext(); + return getDefaultIndexingMapsForMatmul(context); +} + +/// Returns true if the given broadcast map \p bcastMap is valid for this op. +bool MatmulOp::isValidLhsRhsBroadcastMap(AffineMap bcastMap) { + assert(bcastMap.getNumResults() == 1 && "Expected single result dim expr."); + AffineExpr exp = bcastMap.getResult(0); + // Invalid map if the common dimension of matmul not found. 
+ return exp.isFunctionOfDim(bcastMap.getNumDims() - 1); +} + +ParseResult MatmulOp::parse(OpAsmParser &parser, OperationState &result) { + return parseNamedStructuredOp(parser, result, MatmulOp::getNumRegionArgs(), + MatmulOp::getRegionBuilder()); +} +void MatmulOp::print(OpAsmPrinter &p) { + SmallVector elidedAttrs = { + "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; + printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), + elidedAttrs); + + SmallVector indexingMaps = + getDefaultIndexingMapAttr(getContext()); + if (!llvm::equal(getIndexingMaps(), indexingMaps)) { + p << " indexing_maps = ["; + llvm::interleaveComma(getIndexingMaps(), p, + [&](Attribute attr) { p.printAttribute(attr); }); + p << "]"; + } +} + +/// Verify the user defined indexing maps. +LogicalResult MatmulOp::verify() { + // Verification of pure matmul is handled by verifyStructuredOpInterface(). + if (!hasUserDefinedMaps()) + return success(); + + for (unsigned opIndex = 0; opIndex < 2; opIndex++) { + if (failed(verifyExtendedMatmulSemantic(*this, opIndex))) + return failure(); + } + return success(); +} + +LogicalResult MatmulOp::fold(FoldAdaptor, SmallVectorImpl &) { + return memref::foldMemRefCast(*this); +} +void MatmulOp::getEffects( + SmallVectorImpl> + &effects) { + if (hasPureTensorSemantics()) + return; + getGenericEffectsImpl(effects, cast(getOperation())); +} + +Speculation::Speculatability MatmulOp::getSpeculatability() { + return getGenericSpeculatabilityImpl(cast(getOperation())); +} + +} // namespace linalg +} // namespace mlir diff --git a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp index aa0052ce47fa7b..6b934f7e8157d4 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp @@ -31,6 +31,13 @@ using namespace mlir::linalg; FailureOr mlir::linalg::transposeMatmul(RewriterBase &rewriter, 
linalg::MatmulOp matmulOp, bool transposeLHS) { + // Check to not let go the matmul with extended semantic, through this + // transform. + if (matmulOp.hasUserDefinedMaps()) { + return rewriter.notifyMatchFailure( + matmulOp, "only matmul ops with non-extended semantics are supported"); + } + if (!bufferization::hasTensorSemantics(matmulOp)) return rewriter.notifyMatchFailure( matmulOp, "only matmul ops with tensors are supported"); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 09c6b2683b4388..e3f010d9cfb20b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -2071,6 +2071,11 @@ vectorizeScalableVectorPrecondition(Operation *op, return failure(); } + // Check to not let go the matmul with extended semantic, through this + // transform. + if (linalgOp.hasUserDefinedMaps()) + return failure(); + // Cond 4: Only the following ops are supported in the // presence of scalable vectors return success(isElementwise(linalgOp) || isa(op) || diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp index 0c2275bbc4b224..3c508ed6e324b2 100644 --- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp +++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp @@ -821,6 +821,12 @@ DiagnosedSilenceableFailure transform::RewriteMatmulAsMmaSyncOp::applyToOne( bool fail = true; // TODO: more robust detection of matmulOp, with transposes etc. if (isa_and_nonnull(linalgOp.getOperation())) { + // Check to not let go the matmul with extended semantic, through this + // transform. + if (linalgOp.hasUserDefinedMaps()) { + return emitSilenceableError() + << "only matmul ops with non-extended semantics are supported"; + } Location loc = linalgOp.getLoc(); // TODO: more robust computation of laneId, for now assume a single warp. 
Value laneId = rewriter.create( diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py index e4a6ec7487bb2f..d5e79b4d3cb6dd 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -383,23 +383,6 @@ def select( O[None] = TernaryFn.select(cond[None], lhs[None], rhs[None]) -@linalg_structured_op -def matmul( - A=TensorDef(T1, S.M, S.K), - B=TensorDef(T2, S.K, S.N), - C=TensorDef(U, S.M, S.N, output=True), - cast=TypeFnAttrDef(default=TypeFn.cast_signed), -): - """Performs a matrix multiplication of two 2D inputs. - - Numeric casting is performed on the operands to the inner multiply, promoting - them to the same data type as the accumulator/output. - """ - domain(D.m, D.n, D.k) - implements(ContractionOpInterface) - C[D.m, D.n] += cast(U, A[D.m, D.k]) * cast(U, B[D.k, D.n]) - - @linalg_structured_op def quantized_matmul( A=TensorDef(T1, S.M, S.K), diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir index 1e8f1435ca0fa5..aba26c35931fd3 100644 --- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir @@ -29,6 +29,34 @@ func.func @generalize_matmul_buffer(%A : memref<16x8xf32>, %B: memref<8x32xf32>, // ----- +func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func.func @matmul_bcast_a( 
+// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { +// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5x7xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) { +// CHECK: ^bb0(%[[VAL_3:.*]]: f32, %[[VAL_4:.*]]: f32, %[[VAL_5:.*]]: f32): +// CHECK: %[[VAL_6:.*]] = arith.mulf %[[VAL_3]], %[[VAL_4]] : f32 +// CHECK: %[[VAL_7:.*]] = arith.addf %[[VAL_5]], %[[VAL_6]] : f32 +// CHECK: linalg.yield %[[VAL_7]] : f32 +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + func.func @generalize_matmul_tensor(%A : tensor<16x8xf32>, %B: tensor<8x32xf32>, %C: tensor<16x32xf32>) -> tensor<16x32xf32> { %0 = linalg.matmul ins(%A, %B: tensor<16x8xf32>, tensor<8x32xf32>) outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32> @@ -891,3 +919,86 @@ func.func @fill_tensor(%f: f32, %v: vector<2x4xf32>) -> (tensor, tensor, tensor> } + +// ----- + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_transpose_a_explicit( +// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { + +// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} +// CHECK: arith.mulf +// CHECK: arith.addf + +func.func @matmul_transpose_a_explicit(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) + outs(%arg2: 
memref<3x7xf32>) + + return +} + +// ----- + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func.func @matmul_transpose_b_explicit( +// CHECK-SAME: %[[VAL_0:.*]]: memref<3x5xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { + +// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} +// CHECK: arith.mulf +// CHECK: arith.addf + +func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>) + outs(%arg2: memref<3x7xf32>) + + return +} + +// ----- + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_transpose_a_b_explicit( +// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { + +// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} +// CHECK: arith.mulf +// CHECK: arith.addf + +func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<7x5xf32>) + outs(%arg2: memref<3x7xf32>) + + return +} + +// 
----- + diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index c481a723c5623c..b2869893b8042d 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -361,6 +361,165 @@ func.func @invalid_static_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>, // ----- +func.func @invalid_indexing_maps_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>, %arg2: memref<2x4xf32>) { + // expected-error @+1 {{expected attribute value}} + linalg.matmul indexing_maps = [ + , + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<2x4xf32>, memref<3x4xf32>) + outs(%arg2 :memref<2x4xf32>) + return +} + +// ----- + +func.func @invalid_matmul_dim_a(%arg0: memref<5x5xf32>, %arg1: memref<5x5xf32>, %arg2: memref<5x5xf32>) { + // expected-error @+1 {{Unexpected dim expression in map result}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x5xf32>, memref<5x5xf32>) outs(%arg2: memref<5x5xf32>) + return +} + +// ----- + +func.func @invalid_matmul_dim_b(%arg0: memref<5x5xf32>, %arg1: memref<5x5xf32>, %arg2: memref<5x5xf32>) { + // expected-error @+1 {{Unexpected dim expression in map result}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x5xf32>, memref<5x5xf32>) outs(%arg2: memref<5x5xf32>) + return +} + +// ----- + +func.func @invalid_transpose_a_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) -> tensor<4x64xf32> { + // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 4, but found 1}} + %0 = linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + 
affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) + outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32> + return %0: tensor<4x64xf32> +} + +// ----- + +func.func @invalid_transpose_b_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) -> tensor<4x64xf32> { + // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #1 to be 1, but found 64}} + %0 = linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) + outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32> + return %0: tensor<4x64xf32> +} + +// ----- + +func.func @invalid_bcast_a(%arg0: memref<3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + // expected-error @+1 {{'linalg.matmul' op Invalid broadcast requested, should be (d2)}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// ----- + +func.func @invalid_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<7xf32>, %arg2: memref<3x7xf32>) { + // expected-error @+1 {{'linalg.matmul' op Invalid broadcast requested, should be (d2)}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// ----- + +func.func @invalid_bcast_a_rank_mismatch(%arg0: memref<3x5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + // expected-error @+1 {{'linalg.matmul' op expected operand rank (2) to match the result rank of indexing_map #0 (1)}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d2, 
d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// ----- + +func.func @invalid_bcast_b_rank_mismatch(%arg0: memref<3x5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + // expected-error @+1 {{'linalg.matmul' op expected operand rank (2) to match the result rank of indexing_map #1 (1)}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// ----- + +func.func @invalid_matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<7xf32>, %arg2: memref<3x7xf32>) { + // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 5, but found 7}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// ----- + +func.func @invalid_matmul_bcast_b_transpose_a_wrong_dim(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { + // expected-error @+1 {{'linalg.matmul' op Unexpected dim expression in map result.}} + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// ----- + +func.func @invalid_indexing_maps_placement_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) { + // expected-error @+2 {{custom op 'indexing_maps' is unknown (tried 'func.indexing_maps' as well)}} + linalg.matmul ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) outs(%init : tensor<4x64xf32>) + indexing_maps = [ + affine_map<(d0, d1, d2) -> 
(d0, d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + return +} + +// ----- + func.func @invalid_static_2d_conv(%input : memref<1x3x4x2xf32>, %filter: memref<3x2x2x1xf32>, %output: memref<1x2x3x1xf32>) { // expected-error @+1 {{inferred input/output operand #0 has shape's dimension #1 to be greater than or equal to 4, but found 3}} linalg.conv_2d_nhwc_hwcf diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index 02ecbed232c8b5..65c18de8424771 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -1201,6 +1201,249 @@ func.func @matmul_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %a // ----- +// CHECK-LABEL: func @matmul_transpose_a_explicit +// CHECK: linalg.matmul +// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5x3xf32>, memref<5x7xf32>) +// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) +func.func @matmul_transpose_a_explicit(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) + outs(%arg2: memref<3x7xf32>) + + return +} + +// ----- + +func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>) + outs(%arg2: memref<3x7xf32>) + + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_transpose_b_explicit( +// CHECK-SAME: %[[VAL_0:.*]]: 
memref<3x5xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { +// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: return +// CHECK: } + +// ----- + +func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<7x5xf32>) + outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_transpose_a_b_explicit( +// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { +// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: return +// CHECK: } + +// ----- + +func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func @matmul_bcast_a +// CHECK: linalg.matmul +// CHECK-SAME: ins(%{{.+}}, %{{.+}} : 
memref<5xf32>, memref<5x7xf32>) +// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) + +// ----- + +func.func @matmul_bcast_a_dim1(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func @matmul_bcast_a_dim1 +// CHECK: linalg.matmul +// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5xf32>, memref<5x7xf32>) +// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) + +// ----- + +func.func @matmul_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func @matmul_bcast_b +// CHECK: linalg.matmul +// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>) +// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) + +// ----- + +func.func @matmul_bcast_a_b(%arg0: memref<5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: 
#[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_bcast_a_b( +// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, %[[VAL_1:.*]]: memref<5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { +// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_0]], #[[$ATTR_1]]] +// CHECK: return +// CHECK: } + +// ----- + +func.func @matmul_bcast_b_dim1(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d0, d2)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func @matmul_bcast_b_dim1 +// CHECK: linalg.matmul +// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>) +// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) + +// ----- + +func.func @dynamic_matmul_bcast_a(%arg0: memref, %arg1: memref, %arg2: memref) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d2, d1)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref, memref) outs(%arg2: memref) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @dynamic_matmul_bcast_a( +// CHECK-SAME: %[[VAL_0:.*]]: memref, +// CHECK-SAME: %[[VAL_1:.*]]: memref, +// CHECK-SAME: %[[VAL_2:.*]]: memref) { +// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref, memref) outs(%[[VAL_2]] : memref) indexing_maps = [#[[$ATTR_0]], 
#[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: return +// CHECK: } + +// ----- + +func.func @matmul_bcast_a_transpose_b(%arg0: memref<5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d1, d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5xf32>, memref<7x5xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_bcast_a_transpose_b( +// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { +// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: return +// CHECK: } + +// ----- + +func.func @matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { + linalg.matmul indexing_maps = [ + affine_map<(d0, d1, d2) -> (d2, d0)>, + affine_map<(d0, d1, d2) -> (d2)>, + affine_map<(d0, d1, d2) -> (d0, d1)> + ] + ins(%arg0, %arg1 : memref<5x3xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) + return +} + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> +// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> +// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func.func @matmul_bcast_b_transpose_a( +// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: memref<5xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { +// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// 
CHECK: return +// CHECK: } + +// ----- + // CHECK-LABEL: func @matmul_transpose_b // CHECK: linalg.matmul_transpose_b // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<7x5xf32>) diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py index 3bfbcf7d7f7c81..72045a07b2da80 100644 --- a/mlir/test/python/dialects/linalg/ops.py +++ b/mlir/test/python/dialects/linalg/ops.py @@ -84,81 +84,6 @@ def named_form(lhs, rhs): print(module) - -# CHECK-LABEL: TEST: testNamedStructuredOpGenericForm -@run -def testNamedStructuredOpGenericForm(): - with Context() as ctx, Location.unknown(): - module = Module.create() - f32 = F32Type.get() - with InsertionPoint(module.body): - - @func.FuncOp.from_py_func( - RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) - ) - def named_form(lhs, rhs): - init_result = tensor.empty([4, 8], f32) - # CHECK: "linalg.matmul"(%{{.*}}) - # CHECK-SAME: cast = #linalg.type_fn - # CHECK-SAME: operandSegmentSizes = array - # CHECK-NEXT: ^bb0(%{{.*}}: f32, %{{.*}}: f32, %{{.*}}: f32): - # CHECK-NEXT: arith.mulf{{.*}} (f32, f32) -> f32 - # CHECK-NEXT: arith.addf{{.*}} (f32, f32) -> f32 - # CHECK-NEXT: linalg.yield{{.*}} (f32) -> () - # CHECK-NEXT: (tensor<4x16xf32>, tensor<16x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> - return linalg.matmul(lhs, rhs, outs=[init_result]) - - module.operation.print(print_generic_op_form=True) - - -# CHECK-LABEL: TEST: testNamedStructuredAsGenericOp -@run -def testNamedStructuredAsGenericOp(): - with Context() as ctx, Location.unknown(): - module = Module.create() - f32 = F32Type.get() - with InsertionPoint(module.body): - - @func.FuncOp.from_py_func( - RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) - ) - def generic_form(lhs, rhs): - init_result = tensor.EmptyOp([4, 8], f32) - # CHECK: linalg.generic - return linalg.matmul( - lhs, rhs, outs=[init_result.result], emit_generic=True - ) - - print(module) - - -# CHECK-LABEL: TEST: 
testOpResultFromOtherOp -@run -def testOpResultFromOtherOp(): - with Context(), Location.unknown(): - module = Module.create() - f32 = F32Type.get() - with InsertionPoint(module.body): - - @func.FuncOp.from_py_func( - RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) - ) - def pass_an_op_directly(arg0, arg1): - one = arith.ConstantOp(F32Type.get(), 1.0) - # CHECK: %[[LHS:.*]] = linalg.fill - lhs = linalg.fill(one, outs=[arg0]) - # CHECK: %[[RHS:.*]] = linalg.fill - rhs = linalg.fill(one, outs=[arg1]) - # CHECK: %[[INIT:.*]] = tensor.empty - init = tensor.EmptyOp([4, 8], f32) - # CHECK: linalg.matmul - # CHECK: ins(%[[LHS]], %[[RHS]] - # CHECK: outs(%[[INIT]] - return linalg.matmul(lhs, rhs, outs=init) - - print(module) - - # CHECK-LABEL: TEST: testIdentityRegionOps @run def testIdentityRegionOps(): diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp index aa5a52a21f1251..f820cb7ee8c3c4 100644 --- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp +++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp @@ -681,7 +681,11 @@ ParseResult {0}::parse(OpAsmParser &parser, OperationState &result) {{ {0}::getNumRegionArgs(), {0}::getRegionBuilder()); } void {0}::print(OpAsmPrinter &p) {{ - ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs()); + SmallVector elidedAttrs = {{"operandSegmentSizes", + "linalg.memoized_indexing_maps", + "indexing_maps"}; + ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), + elidedAttrs); } )FMT"; From cb5fbd2f60a5a588bfa4668ea8269c3568cbff6e Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Thu, 10 Oct 2024 09:01:50 -0700 Subject: [PATCH 038/177] [CodeLayout] Do not verify after assigning blocks (#111754) Rather than invariantly running `F->verify()` when asserts are enabled, run machine IR verification in LIT tests only. 
Swap `CHECK-PERF` and `CHECK-SIZE` in `code_placement_ext_tsp_large.ll`. Remove `={0,1,true,false}` from flags in tests. --- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 7 +--- .../CodeGen/X86/code_placement_ext_tsp.ll | 2 +- .../X86/code_placement_ext_tsp_large.ll | 8 ++--- .../X86/code_placement_ext_tsp_size.ll | 34 +++++++++---------- 4 files changed, 23 insertions(+), 28 deletions(-) diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index c42e63202c3b5a..dd5220b4599f95 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -3572,7 +3572,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { if (UseExtTspForPerf || UseExtTspForSize) { assert( !(UseExtTspForPerf && UseExtTspForSize) && - "UseExtTspForPerf and UseExtTspForSize can not be set simultaneosly"); + "UseExtTspForPerf and UseExtTspForSize can not be set simultaneously"); applyExtTsp(/*OptForSize=*/UseExtTspForSize); createCFGChainExtTsp(); } @@ -3745,11 +3745,6 @@ void MachineBlockPlacement::assignBlockOrder( continue; MBB.updateTerminator(FTMBB); } - -#ifndef NDEBUG - // Make sure we correctly constructed all branches. 
- F->verify(this, "After optimized block reordering", &errs()); -#endif } void MachineBlockPlacement::createCFGChainExtTsp() { diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll index be0b9820e14541..37e3245467c869 100644 --- a/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll +++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll @@ -1,5 +1,5 @@ ;; See also llvm/unittests/Transforms/Utils/CodeLayoutTest.cpp -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 < %s | FileCheck %s +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs -enable-ext-tsp-block-placement < %s | FileCheck %s define void @func1a() { ; Test that the algorithm positions the most likely successor first diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll index ac172d32c6d8b6..24c52f1e88656e 100644 --- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll +++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll @@ -1,8 +1,8 @@ ; REQUIRES: asserts -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-chain-split-threshold=128 -debug-only=block-placement < %s 2>&1 | FileCheck %s -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-chain-split-threshold=1 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK2 -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=0 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK3 -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-block-placement-max-blocks=8 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK4 +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs -enable-ext-tsp-block-placement -ext-tsp-chain-split-threshold=128 -debug-only=block-placement < %s 2>&1 | FileCheck %s 
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs -enable-ext-tsp-block-placement -ext-tsp-chain-split-threshold=1 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK2 +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK3 +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs -enable-ext-tsp-block-placement -ext-tsp-block-placement-max-blocks=8 -debug-only=block-placement < %s 2>&1 | FileCheck %s -check-prefix=CHECK4 @yydebug = dso_local global i32 0, align 4 diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll index 59eaf2586f1737..e7a4d6d8fd23a5 100644 --- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll +++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_size.ll @@ -1,5 +1,5 @@ -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=true < %s | FileCheck %s -check-prefix=CHECK-PERF -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -apply-ext-tsp-for-size=false < %s | FileCheck %s -check-prefix=CHECK-SIZE +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs -apply-ext-tsp-for-size < %s | FileCheck %s -check-prefix=CHECK-SIZE +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -verify-machineinstrs < %s | FileCheck %s -check-prefix=CHECK-PERF define void @func1() minsize { ; @@ -19,15 +19,15 @@ define void @func1() minsize { ; | b2 | <+ ; +-----+ ; -; CHECK-PERF-LABEL: func1: -; CHECK-PERF: %b0 -; CHECK-PERF: %b1 -; CHECK-PERF: %b2 -; ; CHECK-SIZE-LABEL: func1: ; CHECK-SIZE: %b0 -; CHECK-SIZE: %b2 ; CHECK-SIZE: %b1 +; CHECK-SIZE: %b2 +; +; CHECK-PERF-LABEL: func1: +; CHECK-PERF: %b0 +; CHECK-PERF: %b2 +; CHECK-PERF: %b1 b0: %call = call zeroext i1 @a() @@ -75,21 +75,21 @@ define void @func_loop() minsize !prof !9 { ; | end | ; +--------+ ; -; CHECK-PERF-LABEL: func_loop: -; CHECK-PERF: %entry -; CHECK-PERF: %header -; CHECK-PERF: 
%if.then -; CHECK-PERF: %if.else -; CHECK-PERF: %if.end -; CHECK-PERF: %end -; ; CHECK-SIZE-LABEL: func_loop: ; CHECK-SIZE: %entry ; CHECK-SIZE: %header +; CHECK-SIZE: %if.then ; CHECK-SIZE: %if.else ; CHECK-SIZE: %if.end -; CHECK-SIZE: %if.then ; CHECK-SIZE: %end +; +; CHECK-PERF-LABEL: func_loop: +; CHECK-PERF: %entry +; CHECK-PERF: %header +; CHECK-PERF: %if.else +; CHECK-PERF: %if.end +; CHECK-PERF: %if.then +; CHECK-PERF: %end entry: br label %header From 25d9688c43d37c0c918e9b8ab2f67be35b0fb75f Mon Sep 17 00:00:00 2001 From: yronglin Date: Fri, 11 Oct 2024 00:04:02 +0800 Subject: [PATCH 039/177] [Clang] Extend lifetime of temporaries in mem-default-init for P2718R0 (#86960) Depends on [CWG1815](https://github.com/llvm/llvm-project/pull/108039). Fixes https://github.com/llvm/llvm-project/issues/85613. In [[Clang] Implement P2718R0 "Lifetime extension in range-based for loops"](https://github.com/llvm/llvm-project/pull/76361), we've not implement the lifetime extensions for the temporaries which in `CXXDefaultInitExpr`. As the confirmation in https://github.com/llvm/llvm-project/issues/85613, we should extend lifetime for that. To avoid modifying current CodeGen rules, in a lifetime extension context, the cleanup of `CXXDefaultInitExpr` was ignored. --------- Signed-off-by: yronglin --- clang/docs/ReleaseNotes.rst | 3 + clang/lib/Sema/SemaExpr.cpp | 2 + clang/lib/Sema/SemaInit.cpp | 2 + .../test/AST/ast-dump-for-range-lifetime.cpp | 59 +++++++++ clang/test/CXX/special/class.temporary/p6.cpp | 122 +++++++++++++++++- clang/www/cxx_status.html | 9 +- 6 files changed, 188 insertions(+), 9 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c0019cfe4658d7..e48835d4738007 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -171,6 +171,9 @@ C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ - Removed the restriction to literal types in constexpr functions in C++23 mode. 
+- Extend lifetime of temporaries in mem-default-init for P2718R0. Clang now fully + supported `P2718R0 Lifetime extension in range-based for loops `_. + C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index e2141e03ca4230..4e37385710af5e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -5649,6 +5649,8 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) { runWithSufficientStackSpace(Loc, [&] { MarkDeclarationsReferencedInExpr(E, /*SkipLocalVariables=*/false); }); + if (isInLifetimeExtendingContext()) + DiscardCleanupsInEvaluationContext(); // C++11 [class.base.init]p7: // The initialization of each base and member constitutes a // full-expression. diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index edd1fe40fdf278..5d6a586fe5a2cf 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -763,6 +763,8 @@ void InitListChecker::FillInEmptyInitForField(unsigned Init, FieldDecl *Field, SemaRef.currentEvaluationContext().DelayedDefaultInitializationContext = SemaRef.parentEvaluationContext() .DelayedDefaultInitializationContext; + SemaRef.currentEvaluationContext().InLifetimeExtendingContext = + SemaRef.parentEvaluationContext().InLifetimeExtendingContext; DIE = SemaRef.BuildCXXDefaultInitExpr(Loc, Field); } if (DIE.isInvalid()) { diff --git a/clang/test/AST/ast-dump-for-range-lifetime.cpp b/clang/test/AST/ast-dump-for-range-lifetime.cpp index 0e92b6990ed504..ee046be19ab632 100644 --- a/clang/test/AST/ast-dump-for-range-lifetime.cpp +++ b/clang/test/AST/ast-dump-for-range-lifetime.cpp @@ -449,4 +449,63 @@ void test13() { for (auto e : dg().r().g().r().g().r().g()) bar(e); } + +extern "C" void exit(int); + +struct A14 { + int arr[1]; + ~A14() noexcept(false) { throw 42; } +}; + +struct B14 { + int x; + const A14 &a = A14{{0}}; + const int *begin() { return a.arr; } + const int *end() { return &a.arr[1]; } 
+}; + +void test14() { + // The ExprWithCleanups in CXXDefaultInitExpr will be ignored. + + // CHECK: FunctionDecl {{.*}} test14 'void ()' + // CHECK: -CXXForRangeStmt {{.*}} + // CHECK-NEXT: |-<<>> + // CHECK-NEXT: |-DeclStmt {{.*}} + // CHECK-NEXT: | `-VarDecl {{.*}} implicit used __range1 'const int (&)[1]' cinit + // CHECK-NEXT: | `-ExprWithCleanups {{.*}} 'const int[1]' lvalue + // CHECK-NEXT: | `-MemberExpr {{.*}} 'const int[1]' lvalue .arr {{.*}} + // CHECK-NEXT: | `-MemberExpr {{.*}} 'const A14':'const P2718R0::A14' lvalue .a {{.*}} + // CHECK-NEXT: | `-MaterializeTemporaryExpr {{.*}} 'B14':'P2718R0::B14' xvalue extended by Var {{.*}} '__range1' 'const int (&)[1]' + // CHECK-NEXT: | `-CXXFunctionalCastExpr {{.*}} 'B14':'P2718R0::B14' functional cast to B14 + // CHECK-NEXT: | `-InitListExpr {{.*}} 'B14':'P2718R0::B14' + // CHECK-NEXT: | |-IntegerLiteral {{.*}} 'int' 0 + // CHECK-NEXT: | `-CXXDefaultInitExpr {{.*}} 'const A14':'const P2718R0::A14' lvalue has rewritten init + // CHECK-NEXT: | `-MaterializeTemporaryExpr {{.*}} 'const A14':'const P2718R0::A14' lvalue extended by Var {{.*}} '__range1' 'const int (&)[1]' + // CHECK-NEXT: | `-ImplicitCastExpr {{.*}} 'const A14':'const P2718R0::A14' + // CHECK-NEXT: | `-CXXFunctionalCastExpr {{.*}} 'A14':'P2718R0::A14' functional cast to A14 + // CHECK-NEXT: | `-CXXBindTemporaryExpr {{.*}} 'A14':'P2718R0::A14' (CXXTemporary {{.*}}) + // CHECK-NEXT: | `-InitListExpr {{.*}} 'A14':'P2718R0::A14' + // CHECK-NEXT: | `-InitListExpr {{.*}} 'int[1]' + // CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 0 + for (auto &&x : B14{0}.a.arr) { exit(0); } + + // CHECK: -CXXForRangeStmt {{.*}} + // CHECK-NEXT: |-<<>> + // CHECK-NEXT: |-DeclStmt {{.*}} + // CHECK-NEXT: | `-VarDecl {{.*}} col:19 implicit used __range1 'B14 &&' cinit + // CHECK-NEXT: | `-ExprWithCleanups {{.*}} 'B14':'P2718R0::B14' xvalue + // CHECK-NEXT: | `-MaterializeTemporaryExpr {{.*}} 'B14':'P2718R0::B14' xvalue extended by Var {{.*}} '__range1' 'B14 &&' + // 
CHECK-NEXT: | `-CXXFunctionalCastExpr {{.*}} 'B14':'P2718R0::B14' functional cast to B14 + // CHECK-NEXT: | `-InitListExpr {{.*}} 'B14':'P2718R0::B14' + // CHECK-NEXT: | |-IntegerLiteral {{.*}} 'int' 0 + // CHECK-NEXT: | `-CXXDefaultInitExpr {{.*}} 'const A14':'const P2718R0::A14' lvalue has rewritten init + // CHECK-NEXT: | `-MaterializeTemporaryExpr {{.*}} 'const A14':'const P2718R0::A14' lvalue extended by Var {{.*}} '__range1' 'B14 &&' + // CHECK-NEXT: | `-ImplicitCastExpr {{.*}} 'const A14':'const P2718R0::A14' + // CHECK-NEXT: | `-CXXFunctionalCastExpr {{.*}} 'A14':'P2718R0::A14' functional cast to A14 + // CHECK-NEXT: | `-CXXBindTemporaryExpr {{.*}} 'A14':'P2718R0::A14' (CXXTemporary {{.*}}) + // CHECK-NEXT: | `-InitListExpr {{.*}} 'A14':'P2718R0::A14' + // CHECK-NEXT: | `-InitListExpr {{.*}} 'int[1]' + // CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 0 + for (auto &&x : B14{0}) { exit(0); } +} } // namespace P2718R0 diff --git a/clang/test/CXX/special/class.temporary/p6.cpp b/clang/test/CXX/special/class.temporary/p6.cpp index a6d2adfd1fd2c5..2b1b531b7172ca 100644 --- a/clang/test/CXX/special/class.temporary/p6.cpp +++ b/clang/test/CXX/special/class.temporary/p6.cpp @@ -463,6 +463,80 @@ template void default_arg_dependent_context2(); template void default_arg_dependent_context3(); } // namespace default_arg +namespace default_init { +template +struct DepA { + T arr[1]; + ~DepA() {} +}; + +template +struct DepB { + int x; + const DepA &a = DepA{{0}}; + ~DepB() {} + const int *begin() { return a.arr; } + const int *end() { return &a.arr[1]; } +}; + +template +void default_init1_dependent() { + // CHECK-CXX23: void @_ZN7P2718R012default_init23default_init1_dependentINS0_4DepBIiEEEEvv() + // CHECK-CXX23-LABEL: for.cond.cleanup: + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init4DepBIiED1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init4DepAIiED1Ev( + for (auto &&x : T{0}) {} +} + +template +void default_init2_dependent() { + // CHECK-CXX23: 
void @_ZN7P2718R012default_init23default_init2_dependentINS0_4DepBIiEEEEvv() + // CHECK-CXX23-LABEL: for.cond.cleanup: + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init4DepBIiED1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init4DepAIiED1Ev( + for (auto &&x : T{0}.a.arr) {} +} + +template void default_init1_dependent>(); +template void default_init2_dependent>(); +} // namespace default_init + +// -- Examples from https://wg21.link/p2718r0 +extern void block_scope_begin_function(); +extern void block_scope_end_function(); +namespace std_examples { +using T = std::list; +const T& f1(const T& t) { return t; } +const T& f2(T t) { return t; } +T g(); +void foo() { + // CHECK-CXX23: define {{.*}} void @_ZN7P2718R012std_examples3fooEv() + // CHECK-CXX23: call void @_ZN7P2718R026block_scope_begin_functionEv + block_scope_begin_function(); + { + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012std_examples1gEv + // CHECK-CXX23-NEXT: call {{.*}} @_ZN7P2718R012std_examples2f1ERKSt4listIiE + // CHECK-CXX23: for.cond.cleanup: + // CHECK-CXX23-NEXT: call void @_ZNSt4listIiED1Ev + for (auto e : f1(g())) {} // OK, lifetime of return value of g() extended + } + // CHECK-CXX23: call void @_ZN7P2718R024block_scope_end_functionEv + block_scope_end_function(); + + // The lifetime of temporary returned by g() in this case will not be extended. 
+ // CHECK-CXX23: call void @_ZN7P2718R026block_scope_begin_functionEv + block_scope_begin_function(); + { + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012std_examples1gEv + // CHECK-CXX23-NEXT: call {{.*}} @_ZN7P2718R012std_examples2f2ESt4listIiE + // CHECK-CXX23-NEXT: call void @_ZNSt4listIiED1Ev + for (auto e : f2(g())) {} // undefined behavior + } + // CHECK-CXX23: call void @_ZN7P2718R024block_scope_end_functionEv + block_scope_end_function(); +} +} // namespace std_examples + namespace basic { using T = std::list; const T& f1(const T& t) { return t; } @@ -579,5 +653,51 @@ void default_arg3() { for (auto e : C(0, C(0, C(0, C())))) {} } } // namespace default_arg -} // namespace P2718R0 +namespace default_init { +struct X { + int x; + ~X() {} +}; + +struct Y { + int y; + const X &x = X{1}; + ~Y() {} +}; + +struct A { + int arr[1]; + const Y &y = Y{1}; + ~A() {} +}; + +struct B { + int x; + const A &a = A{{0}}; + ~B() {} + const int *begin() { return a.arr; } + const int *end() { return &a.arr[1]; } +}; + +void default_init1() { + // CHECK-CXX23: void @_ZN7P2718R012default_init13default_init1Ev() + // CHECK-CXX23-LABEL: for.cond.cleanup: + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1BD1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1AD1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1YD1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1XD1Ev( + for (auto &&x : B{0}) {} +} + +void default_init2() { + // CHECK-CXX23: void @_ZN7P2718R012default_init13default_init2Ev() + // CHECK-CXX23-LABEL: for.cond.cleanup: + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1BD1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1AD1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1YD1Ev( + // CHECK-CXX23-NEXT: call void @_ZN7P2718R012default_init1XD1Ev( + for (auto &&x : B{0}.a.arr) {} +} +} // namespace default_init +} // namespace P2718R0 diff --git a/clang/www/cxx_status.html 
b/clang/www/cxx_status.html index 3f6a46c08c8514..d59cbbbbec1b5b 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -475,14 +475,7 @@

C++23 implementation status

Lifetime extension in range-based for loops
P2718R0 - -
- Clang 19 (Partial) - The lifetime extension of temporaries bound to member references - by default member initializers in aggregate initialization was - not supported now. -
- + Clang 20 From 2190ffa0f7e874d04fd0f750142135faa5df5d6b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 10 Oct 2024 11:07:14 -0500 Subject: [PATCH 040/177] [libc] Fix missing namespace declarations --- libc/src/stdio/asprintf.h | 4 ++-- libc/src/stdio/vasprintf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libc/src/stdio/asprintf.h b/libc/src/stdio/asprintf.h index 222dfdee9d4fd7..168721c4f98b98 100644 --- a/libc/src/stdio/asprintf.h +++ b/libc/src/stdio/asprintf.h @@ -11,10 +11,10 @@ #include "src/__support/macros/config.h" -namespace LIBC_NAMESPACE { +namespace LIBC_NAMESPACE_DECL { int asprintf(char **__restrict s, const char *__restrict format, ...); -} // namespace LIBC_NAMESPACE +} // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_STDIO_ASPRINTF_H diff --git a/libc/src/stdio/vasprintf.h b/libc/src/stdio/vasprintf.h index 8b286fe69bf203..b914c2f9ae0789 100644 --- a/libc/src/stdio/vasprintf.h +++ b/libc/src/stdio/vasprintf.h @@ -11,11 +11,11 @@ #include -namespace LIBC_NAMESPACE { +namespace LIBC_NAMESPACE_DECL { int vasprintf(char **__restrict s, const char *__restrict format, va_list vlist); -} // namespace LIBC_NAMESPACE +} // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_STDIO_VASPRINTF_H From 73e74e496ec32a13a5ae71df71364065f7be3cca Mon Sep 17 00:00:00 2001 From: Eric Astor Date: Thu, 10 Oct 2024 12:21:34 -0400 Subject: [PATCH 041/177] [clang][frontend] Support applying the annotate attribute to statements (#111841) By allowing AnnotateAttr to be applied to statements, users can place arbitrary information in the AST for later use. For example, this can be used for HW-targeted language extensions that involve specialized loop annotations. 
--- clang/include/clang/AST/Attr.h | 17 +++++++++ clang/include/clang/Basic/Attr.td | 7 +++- clang/include/clang/Sema/Sema.h | 7 ++-- clang/lib/Sema/Sema.cpp | 28 ++++++++++++++ clang/lib/Sema/SemaDeclAttr.cpp | 25 ++---------- clang/lib/Sema/SemaStmtAttr.cpp | 2 + clang/lib/Sema/SemaTemplateInstantiate.cpp | 13 +++++++ .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 5 ++- clang/test/AST/attr-print-emit.cpp | 3 ++ clang/test/Sema/annotate.c | 3 ++ clang/test/SemaTemplate/attributes.cpp | 38 +++++++++++++++++++ clang/utils/TableGen/ClangAttrEmitter.cpp | 30 ++++++++------- 12 files changed, 137 insertions(+), 41 deletions(-) diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index ac44e9fdd7c4e9..725498e132fc28 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -197,6 +197,23 @@ class InheritableParamAttr : public InheritableAttr { } }; +class InheritableParamOrStmtAttr : public InheritableParamAttr { +protected: + InheritableParamOrStmtAttr(ASTContext &Context, + const AttributeCommonInfo &CommonInfo, + attr::Kind AK, bool IsLateParsed, + bool InheritEvenIfAlreadyPresent) + : InheritableParamAttr(Context, CommonInfo, AK, IsLateParsed, + InheritEvenIfAlreadyPresent) {} + +public: + // Implement isa/cast/dyncast/etc. + static bool classof(const Attr *A) { + return A->getKind() >= attr::FirstInheritableParamOrStmtAttr && + A->getKind() <= attr::LastInheritableParamOrStmtAttr; + } +}; + class HLSLAnnotationAttr : public InheritableAttr { protected: HLSLAnnotationAttr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index fbcbf0ed416416..ec3d6e0079f630 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -759,6 +759,11 @@ class TargetSpecificAttr { /// redeclarations, even when it's written on a parameter. 
class InheritableParamAttr : InheritableAttr; +/// A attribute that is either a declaration attribute or a statement attribute, +/// and if used as a declaration attribute, is inherited by later +/// redeclarations, even when it's written on a parameter. +class InheritableParamOrStmtAttr : InheritableParamAttr; + /// An attribute which changes the ABI rules for a specific parameter. class ParameterABIAttr : InheritableParamAttr { let Subjects = SubjectList<[ParmVar]>; @@ -928,7 +933,7 @@ def AnalyzerNoReturn : InheritableAttr { let Documentation = [Undocumented]; } -def Annotate : InheritableParamAttr { +def Annotate : InheritableParamOrStmtAttr { let Spellings = [Clang<"annotate">]; let Args = [StringArgument<"Annotation">, VariadicExprArgument<"Args">]; // Ensure that the annotate attribute can be used with diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index ef010fafb1573e..f8118ca64ad3f2 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -4528,9 +4528,10 @@ class Sema final : public SemaBase { /// declaration. void AddAlignValueAttr(Decl *D, const AttributeCommonInfo &CI, Expr *E); - /// AddAnnotationAttr - Adds an annotation Annot with Args arguments to D. - void AddAnnotationAttr(Decl *D, const AttributeCommonInfo &CI, - StringRef Annot, MutableArrayRef Args); + /// CreateAnnotationAttr - Creates an annotation Annot with Args arguments. 
+ Attr *CreateAnnotationAttr(const AttributeCommonInfo &CI, StringRef Annot, + MutableArrayRef Args); + Attr *CreateAnnotationAttr(const ParsedAttr &AL); bool checkMSInheritanceAttrOnDefinition(CXXRecordDecl *RD, SourceRange Range, bool BestCase, diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index f05760428458b1..9f91ee9a39f2f9 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -2777,3 +2777,31 @@ bool Sema::isDeclaratorFunctionLike(Declarator &D) { }); return Result; } + +Attr *Sema::CreateAnnotationAttr(const AttributeCommonInfo &CI, StringRef Annot, + MutableArrayRef Args) { + + auto *A = AnnotateAttr::Create(Context, Annot, Args.data(), Args.size(), CI); + if (!ConstantFoldAttrArgs( + CI, MutableArrayRef(A->args_begin(), A->args_end()))) { + return nullptr; + } + return A; +} + +Attr *Sema::CreateAnnotationAttr(const ParsedAttr &AL) { + // Make sure that there is a string literal as the annotation's first + // argument. + StringRef Str; + if (!checkStringLiteralArgumentAttr(AL, 0, Str)) + return nullptr; + + llvm::SmallVector Args; + Args.reserve(AL.getNumArgs() - 1); + for (unsigned Idx = 1; Idx < AL.getNumArgs(); Idx++) { + assert(!AL.isArgIdent(Idx)); + Args.push_back(AL.getArgAsExpr(Idx)); + } + + return CreateAnnotationAttr(AL, Str, Args); +} diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index e2174ba926f17f..6759aae37afac1 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3958,30 +3958,11 @@ static void handleTransparentUnionAttr(Sema &S, Decl *D, const ParsedAttr &AL) { RD->addAttr(::new (S.Context) TransparentUnionAttr(S.Context, AL)); } -void Sema::AddAnnotationAttr(Decl *D, const AttributeCommonInfo &CI, - StringRef Str, MutableArrayRef Args) { - auto *Attr = AnnotateAttr::Create(Context, Str, Args.data(), Args.size(), CI); - if (ConstantFoldAttrArgs( - CI, MutableArrayRef(Attr->args_begin(), Attr->args_end()))) { - D->addAttr(Attr); - } -} - 
static void handleAnnotateAttr(Sema &S, Decl *D, const ParsedAttr &AL) { - // Make sure that there is a string literal as the annotation's first - // argument. - StringRef Str; - if (!S.checkStringLiteralArgumentAttr(AL, 0, Str)) - return; - - llvm::SmallVector Args; - Args.reserve(AL.getNumArgs() - 1); - for (unsigned Idx = 1; Idx < AL.getNumArgs(); Idx++) { - assert(!AL.isArgIdent(Idx)); - Args.push_back(AL.getArgAsExpr(Idx)); + auto *Attr = S.CreateAnnotationAttr(AL); + if (Attr) { + D->addAttr(Attr); } - - S.AddAnnotationAttr(D, AL, Str, Args); } static void handleAlignValueAttr(Sema &S, Decl *D, const ParsedAttr &AL) { diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp index b9b3b4063bc383..d81c6de3428dc7 100644 --- a/clang/lib/Sema/SemaStmtAttr.cpp +++ b/clang/lib/Sema/SemaStmtAttr.cpp @@ -679,6 +679,8 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A, return handleMSConstexprAttr(S, St, A, Range); case ParsedAttr::AT_NoConvergent: return handleNoConvergentAttr(S, St, A, Range); + case ParsedAttr::AT_Annotate: + return S.CreateAnnotationAttr(A); default: // N.B., ClangAttrEmitter.cpp emits a diagnostic helper that ensures a // declaration attribute is not written on a statement, but this code is diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 74252bd7513cd7..2f60c0beb22e73 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1552,6 +1552,7 @@ namespace { NamedDecl *FirstQualifierInScope = nullptr, bool AllowInjectedClassName = false); + const AnnotateAttr *TransformAnnotateAttr(const AnnotateAttr *AA); const CXXAssumeAttr *TransformCXXAssumeAttr(const CXXAssumeAttr *AA); const LoopHintAttr *TransformLoopHintAttr(const LoopHintAttr *LH); const NoInlineAttr *TransformStmtNoInlineAttr(const Stmt *OrigS, @@ -2182,6 +2183,18 @@ TemplateInstantiator::TransformTemplateParmRefExpr(DeclRefExpr *E, Arg, 
PackIndex); } +const AnnotateAttr * +TemplateInstantiator::TransformAnnotateAttr(const AnnotateAttr *AA) { + SmallVector Args; + for (Expr *Arg : AA->args()) { + ExprResult Res = getDerived().TransformExpr(Arg); + if (Res.isUsable()) + Args.push_back(Res.get()); + } + return AnnotateAttr::CreateImplicit(getSema().Context, AA->getAnnotation(), + Args.data(), Args.size(), AA->getRange()); +} + const CXXAssumeAttr * TemplateInstantiator::TransformCXXAssumeAttr(const CXXAssumeAttr *AA) { ExprResult Res = getDerived().TransformExpr(AA->getAssumption()); diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 34558e1a005d5a..6b1af35f5c80a8 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -230,7 +230,10 @@ static void instantiateDependentAnnotationAttr( ActualArgs.insert(ActualArgs.begin(), Args.begin() + 1, Args.end()); std::swap(Args, ActualArgs); } - S.AddAnnotationAttr(New, *Attr, Str, Args); + auto *AA = S.CreateAnnotationAttr(*Attr, Str, Args); + if (AA) { + New->addAttr(AA); + } } static Expr *instantiateDependentFunctionAttrCondition( diff --git a/clang/test/AST/attr-print-emit.cpp b/clang/test/AST/attr-print-emit.cpp index d8e62ed5f6cd11..a9bca6778d0f1a 100644 --- a/clang/test/AST/attr-print-emit.cpp +++ b/clang/test/AST/attr-print-emit.cpp @@ -78,6 +78,9 @@ class C { ANNOTATE_ATTR int annotated_attr ANNOTATE_ATTR = 0; // CHECK: __attribute__((annotate("Annotated"))) int annotated_attr __attribute__((annotate("Annotated"))) = 0; +void increment() { [[clang::annotate("Annotated")]] annotated_attr++; } +// CHECK: {{\[\[}}clang::annotate("Annotated")]] annotated_attr++; + // FIXME: We do not print the attribute as written after the type specifier. 
int ANNOTATE_ATTR annotated_attr_fixme = 0; // CHECK: __attribute__((annotate("Annotated"))) int annotated_attr_fixme = 0; diff --git a/clang/test/Sema/annotate.c b/clang/test/Sema/annotate.c index b4551a102e6174..f2ef08d6378975 100644 --- a/clang/test/Sema/annotate.c +++ b/clang/test/Sema/annotate.c @@ -3,10 +3,12 @@ void __attribute__((annotate("foo"))) foo(float *a) { __attribute__((annotate("bar"))) int x; [[clang::annotate("bar")]] int x2; + [[clang::annotate("bar")]] x2 += 1; __attribute__((annotate(1))) int y; // expected-error {{expected string literal as argument of 'annotate' attribute}} [[clang::annotate(1)]] int y2; // expected-error {{expected string literal as argument of 'annotate' attribute}} __attribute__((annotate("bar", 1))) int z; [[clang::annotate("bar", 1)]] int z2; + [[clang::annotate("bar", 1)]] z2 += 1; int u = __builtin_annotation(z, (char*) 0); // expected-error {{second argument to __builtin_annotation must be a non-wide string constant}} int v = __builtin_annotation(z, (char*) L"bar"); // expected-error {{second argument to __builtin_annotation must be a non-wide string constant}} @@ -15,4 +17,5 @@ void __attribute__((annotate("foo"))) foo(float *a) { __attribute__((annotate())) int c; // expected-error {{'annotate' attribute takes at least 1 argument}} [[clang::annotate()]] int c2; // expected-error {{'annotate' attribute takes at least 1 argument}} + [[clang::annotate()]] c2 += 1; // expected-error {{'annotate' attribute takes at least 1 argument}} } diff --git a/clang/test/SemaTemplate/attributes.cpp b/clang/test/SemaTemplate/attributes.cpp index f6c9f13f0842d2..dea19d09745ca2 100644 --- a/clang/test/SemaTemplate/attributes.cpp +++ b/clang/test/SemaTemplate/attributes.cpp @@ -65,6 +65,17 @@ namespace attribute_annotate { template [[clang::annotate("ANNOTATE_FOO"), clang::annotate("ANNOTATE_BAR")]] void HasAnnotations(); void UseAnnotations() { HasAnnotations(); } +// CHECK: FunctionTemplateDecl {{.*}} HasStmtAnnotations +// CHECK: 
AnnotateAttr {{.*}} "ANNOTATE_BAZ" +// CHECK: FunctionDecl {{.*}} HasStmtAnnotations +// CHECK: TemplateArgument type 'int' +// CHECK: AnnotateAttr {{.*}} "ANNOTATE_BAZ" +template void HasStmtAnnotations() { + int x = 0; + [[clang::annotate("ANNOTATE_BAZ")]] x++; +} +void UseStmtAnnotations() { HasStmtAnnotations(); } + // CHECK: FunctionTemplateDecl {{.*}} HasPackAnnotations // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... Is // CHECK-NEXT: FunctionDecl {{.*}} HasPackAnnotations 'void ()' @@ -95,6 +106,33 @@ void UseAnnotations() { HasAnnotations(); } template [[clang::annotate("ANNOTATE_BAZ", Is...)]] void HasPackAnnotations(); void UsePackAnnotations() { HasPackAnnotations<1, 2, 3>(); } +// CHECK: FunctionTemplateDecl {{.*}} HasStmtPackAnnotations +// CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... Is +// CHECK-NEXT: FunctionDecl {{.*}} HasStmtPackAnnotations 'void ()' +// CHECK: AttributedStmt {{.*}} +// CHECK-NEXT: AnnotateAttr {{.*}} "ANNOTATE_QUUX" +// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' +// CHECK: FunctionDecl {{.*}} used HasStmtPackAnnotations 'void ()' +// CHECK-NEXT: TemplateArgument{{.*}} pack +// CHECK-NEXT: TemplateArgument{{.*}} integral '1' +// CHECK-NEXT: TemplateArgument{{.*}} integral '2' +// CHECK-NEXT: TemplateArgument{{.*}} integral '3' +// CHECK: AttributedStmt {{.*}} +// CHECK-NEXT: AnnotateAttr {{.*}} "ANNOTATE_QUUX" +// CHECK-NEXT: PackExpansionExpr {{.*}} +// CHECK-NEXT: SubstNonTypeTemplateParmPackExpr {{.*}} +// CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... 
Is +// CHECK-NEXT: TemplateArgument pack '<1, 2, 3>' +// CHECK-NEXT: TemplateArgument integral '1' +// CHECK-NEXT: TemplateArgument integral '2' +// CHECK-NEXT: TemplateArgument integral '3' +template void HasStmtPackAnnotations() { + int x = 0; + [[clang::annotate("ANNOTATE_QUUX", Is...)]] x++; +} +void UseStmtPackAnnotations() { HasStmtPackAnnotations<1, 2, 3>(); } + template [[clang::annotate(Is...)]] void HasOnlyPackAnnotation() {} // expected-error {{expected string literal as argument of 'annotate' attribute}} void UseOnlyPackAnnotations() { diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 28b7ec8f822cf8..4890d249c6d8f7 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3282,16 +3282,16 @@ namespace { } // end anonymous namespace static const AttrClassDescriptor AttrClassDescriptors[] = { - { "ATTR", "Attr" }, - { "TYPE_ATTR", "TypeAttr" }, - { "STMT_ATTR", "StmtAttr" }, - { "DECL_OR_STMT_ATTR", "DeclOrStmtAttr" }, - { "INHERITABLE_ATTR", "InheritableAttr" }, - { "DECL_OR_TYPE_ATTR", "DeclOrTypeAttr" }, - { "INHERITABLE_PARAM_ATTR", "InheritableParamAttr" }, - { "PARAMETER_ABI_ATTR", "ParameterABIAttr" }, - { "HLSL_ANNOTATION_ATTR", "HLSLAnnotationAttr"} -}; + {"ATTR", "Attr"}, + {"TYPE_ATTR", "TypeAttr"}, + {"STMT_ATTR", "StmtAttr"}, + {"DECL_OR_STMT_ATTR", "DeclOrStmtAttr"}, + {"INHERITABLE_ATTR", "InheritableAttr"}, + {"DECL_OR_TYPE_ATTR", "DeclOrTypeAttr"}, + {"INHERITABLE_PARAM_ATTR", "InheritableParamAttr"}, + {"INHERITABLE_PARAM_OR_STMT_ATTR", "InheritableParamOrStmtAttr"}, + {"PARAMETER_ABI_ATTR", "ParameterABIAttr"}, + {"HLSL_ANNOTATION_ATTR", "HLSLAnnotationAttr"}}; static void emitDefaultDefine(raw_ostream &OS, StringRef name, const char *superName) { @@ -4319,10 +4319,12 @@ static void GenerateMutualExclusionsChecks(const Record &Attr, // This means the attribute is either a statement attribute, a decl // attribute, or both; find out 
which. - bool CurAttrIsStmtAttr = - Attr.isSubClassOf("StmtAttr") || Attr.isSubClassOf("DeclOrStmtAttr"); - bool CurAttrIsDeclAttr = - !CurAttrIsStmtAttr || Attr.isSubClassOf("DeclOrStmtAttr"); + bool CurAttrIsStmtAttr = Attr.isSubClassOf("StmtAttr") || + Attr.isSubClassOf("DeclOrStmtAttr") || + Attr.isSubClassOf("InheritableParamOrStmtAttr"); + bool CurAttrIsDeclAttr = !CurAttrIsStmtAttr || + Attr.isSubClassOf("DeclOrStmtAttr") || + Attr.isSubClassOf("InheritableParamOrStmtAttr"); std::vector DeclAttrs, StmtAttrs; From c04b640a919de50342fca9e0afcbf4b710c7ea2f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 10 Oct 2024 11:21:58 -0500 Subject: [PATCH 042/177] [libc] Add missing config include --- libc/src/stdio/vasprintf.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libc/src/stdio/vasprintf.h b/libc/src/stdio/vasprintf.h index b914c2f9ae0789..7a98568edbc071 100644 --- a/libc/src/stdio/vasprintf.h +++ b/libc/src/stdio/vasprintf.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDIO_VASPRINTF_H #define LLVM_LIBC_SRC_STDIO_VASPRINTF_H +#include "src/__support/macros/config.h" #include namespace LIBC_NAMESPACE_DECL { From cc9e7cb99b63559c5baf7e380287e5658c412370 Mon Sep 17 00:00:00 2001 From: TatWai Chong <78814694+tatwaichong@users.noreply.github.com> Date: Thu, 10 Oct 2024 09:54:34 -0700 Subject: [PATCH 043/177] [mlir][tosa] Change the type of profile option to ListOption (#111214) In tosa valiation pass, change the type of profile option to ListOption. Now TOSA profiles is turned from hierarchical to composable. Each profile is an independent set, i.e. an target can implement multiple profiles. Set the profile option to none by default, and limit to profiles if requested. The profiles can be specified via command line, e.g. $ mlir-opt ... --tosa-validate="profile=bi,mi" which tells the valiation pass that BI and MI are enabled. 
Change-Id: I1fb8d0c1b27eccd768349b6eb4234093313efb57 --- .../mlir/Conversion/TosaToLinalg/TosaToLinalg.h | 4 ++-- .../mlir/Dialect/Tosa/Transforms/Passes.td | 17 +++-------------- .../TosaToLinalg/TosaToLinalgPass.cpp | 2 +- .../Dialect/Tosa/Transforms/TosaValidation.cpp | 16 +++++++++++++++- mlir/test/Dialect/Tosa/invalid.mlir | 8 +++++++- mlir/test/Dialect/Tosa/level_check.mlir | 6 +++++- 6 files changed, 33 insertions(+), 20 deletions(-) diff --git a/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h b/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h index 192583f347b8a4..1822016fc88fe6 100644 --- a/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h +++ b/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h @@ -39,8 +39,8 @@ void addTosaToLinalgPasses( TosaToLinalgNamedOptions(), // Note: Default to 'none' level unless otherwise specified. std::optional validationOptions = - tosa::TosaValidationOptions{tosa::TosaProfileEnum::Undefined, false, - tosa::TosaLevelEnum::None}); + tosa::TosaValidationOptions{ + {"none"}, false, tosa::TosaLevelEnum::None}); /// Populates TOSA to linalg pipelines /// Currently, this includes only the "tosa-to-linalg-pipeline". 
diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td index c0352fa88fe08d..dac67633769c76 100644 --- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td @@ -76,7 +76,7 @@ def TosaProfileType : I32EnumAttr<"TosaProfileEnum", "Tosa profile", I32EnumAttrCase<"BaseInference", 0, "bi">, I32EnumAttrCase<"MainInference", 1, "mi">, I32EnumAttrCase<"MainTraining", 2, "mt">, - I32EnumAttrCase<"Undefined", 3> + I32EnumAttrCase<"Undefined", 3, "none"> ]>{ let cppNamespace = "mlir::tosa"; } @@ -97,19 +97,8 @@ def TosaValidation : Pass<"tosa-validate", "mlir::ModuleOp"> { }]; let options = [ - Option<"profile", "profile", "mlir::tosa::TosaProfileEnum", - /*default=*/"mlir::tosa::TosaProfileEnum::Undefined", - "Validate if operations match for the given profile", - [{::llvm::cl::values( - clEnumValN(mlir::tosa::TosaProfileEnum::BaseInference, "bi", - "Use Base Inference profile."), - clEnumValN(mlir::tosa::TosaProfileEnum::MainInference, "mi", - "Use Main Inference profile."), - clEnumValN(mlir::tosa::TosaProfileEnum::MainTraining, "mt", - "Use Main Training profile."), - clEnumValN(mlir::tosa::TosaProfileEnum::Undefined, "undefined", - "Do not define a profile.") - )}]>, + ListOption<"profile", "profile", "std::string", + "Validate if operations match for the given profile set">, Option<"StrictOperationSpecAlignment", "strict-op-spec-alignment", "bool", /*default=*/"false", "Verify if the properties of certain operations align the spec requirement">, diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp index 44036d7c31a912..06a7262c467421 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp @@ -115,7 +115,7 @@ void mlir::tosa::registerTosaToLinalgPipelines() { TosaToLinalgOptions tosaToLinalgOptions; 
TosaToLinalgNamedOptions tosaToLinalgNamedOptions; TosaValidationOptions validationOptions; - validationOptions.profile = tosa::TosaProfileEnum::BaseInference; + validationOptions.profile = {"none"}; validationOptions.StrictOperationSpecAlignment = true; validationOptions.level = tosa::TosaLevelEnum::EightK; tosa::addTosaToLinalgPasses(pm, tosaToLinalgOptions, diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp index b78c372af77e64..e390a613b58077 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp @@ -405,14 +405,28 @@ struct TosaValidation : public tosa::impl::TosaValidationBase { if (level == TosaLevelEnum::EightK) { tosaLevel = TOSA_LEVEL_EIGHTK; } + + if (!profile.empty()) { + for (std::string &prof : profile) { + auto profSymbol = symbolizeTosaProfileEnum(prof); + if (profSymbol) { + enabled_profiles.push_back(profSymbol.value()); + } + } + } } bool CheckVariable(Operation *op); bool CheckVariableReadOrWrite(Operation *op); bool isValidElementType(Type type); + bool isEnabledProfile(TosaProfileEnum prof) { + return std::find(enabled_profiles.begin(), enabled_profiles.end(), prof) != + std::end(enabled_profiles); + } SmallVector> constCheckers; + SmallVector enabled_profiles; TosaLevel tosaLevel; DenseMap variablesMap; }; @@ -507,7 +521,7 @@ LogicalResult TosaValidation::applyVariableCheck(Operation *op) { bool TosaValidation::isValidElementType(Type type) { if (isa(type)) { - if (profile == TosaProfileEnum::BaseInference) + if (!isEnabledProfile(TosaProfileEnum::MainInference)) return false; return type.isF32() || type.isF16() || type.isBF16(); } diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index e5c5b9b3663903..b9298b66643538 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -1,4 +1,10 @@ -// RUN: mlir-opt %s -split-input-file 
-verify-diagnostics --tosa-validate=strict-op-spec-alignment +//-------------------------------------------------------------------------------------------------- +// Test expected errors in terms of the shape and type of tensor, and the argument type of +// operation. Excludes the profile compliance checking since it is performed earlier in the +// validation flow. +//-------------------------------------------------------------------------------------------------- + +// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate="profile=bi,mi,mt strict-op-spec-alignment" func.func @test_const() -> tensor<1xf32> { diff --git a/mlir/test/Dialect/Tosa/level_check.mlir b/mlir/test/Dialect/Tosa/level_check.mlir index 9b652f2d0bd142..e851019362958f 100644 --- a/mlir/test/Dialect/Tosa/level_check.mlir +++ b/mlir/test/Dialect/Tosa/level_check.mlir @@ -1,4 +1,8 @@ -// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate +//-------------------------------------------------------------------------------------------------- +// Enable all supported profiles to focus the verification of expected level errors. +//-------------------------------------------------------------------------------------------------- + +// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate="profile=bi,mi,mt" func.func @test_argmax(%arg0: tensor<1x1x1x1x29x29x4xf32>) -> tensor<1x1x1x1x29x4xi32> { From f2c5aa920054fa60372a161520e6ea8e8d23880d Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 10 Oct 2024 09:53:45 -0700 Subject: [PATCH 044/177] [lldb] Fix a variety of LLDB_LOG format strings LLVM now triggers an assertion when the format string and arguments don't match. Fix a variety of incorrect format strings I discovered when enabling logging with a debug build.
--- .../ExpressionParser/Clang/ClangExpressionDeclMap.cpp | 4 ++-- .../ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp | 2 +- lldb/source/Target/ScriptedThreadPlan.cpp | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp index f994d025043352..5edaa9e4e053cc 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp @@ -934,7 +934,7 @@ void ClangExpressionDeclMap::LookUpLldbObjCClass(NameSearchContext &context) { QualType(interface_type, 0).getAsOpaquePtr(), function_decl_ctx.GetTypeSystem()->weak_from_this()); - LLDB_LOG(log, " FEVD[{0}] Adding type for $__lldb_objc_class: {1}", + LLDB_LOG(log, " FEVD Adding type for $__lldb_objc_class: {0}", ClangUtil::ToString(interface_type)); AddOneType(context, class_user_type); @@ -974,7 +974,7 @@ void ClangExpressionDeclMap::LookUpLldbObjCClass(NameSearchContext &context) { if (!self_clang_type) return; - LLDB_LOG(log, " FEVD[{0}] Adding type for $__lldb_objc_class: {1}", + LLDB_LOG(log, " FEVD Adding type for $__lldb_objc_class: {0}", ClangUtil::ToString(self_type->GetFullCompilerType())); TypeFromUser class_user_type(self_clang_type); diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp index f3a008ff1e8932..96a259b811b5e7 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCDeclVendor.cpp @@ -605,7 +605,7 @@ uint32_t AppleObjCDeclVendor::FindDecls(ConstString name, bool append, if (log) { clang::QualType new_iface_type = ast_ctx.getObjCInterfaceType(iface_decl); - LLDB_LOG(log, "AOCTV::FT Created {1} (isa 0x{2:x})", + LLDB_LOG(log, 
"AOCTV::FT Created {0} (isa 0x{1:x})", new_iface_type.getAsString(), (uint64_t)isa); } diff --git a/lldb/source/Target/ScriptedThreadPlan.cpp b/lldb/source/Target/ScriptedThreadPlan.cpp index a8432f12258ee4..c4bdc8d080e350 100644 --- a/lldb/source/Target/ScriptedThreadPlan.cpp +++ b/lldb/source/Target/ScriptedThreadPlan.cpp @@ -184,8 +184,9 @@ void ScriptedThreadPlan::GetDescription(Stream *s, lldb::StreamSP stream = std::make_shared(); llvm::Error err = m_interface->GetStopDescription(stream); if (err) { - LLDB_LOG_ERROR(GetLog(LLDBLog::Thread), std::move(err), - "Can't call ScriptedThreadPlan::GetStopDescription."); + LLDB_LOG_ERROR( + GetLog(LLDBLog::Thread), std::move(err), + "Can't call ScriptedThreadPlan::GetStopDescription: {0}"); s->Printf("Scripted thread plan implemented by class %s.", m_class_name.c_str()); } else From 0fc3e4093ca5d226df37206626bfac3e4853b0db Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Thu, 10 Oct 2024 10:00:42 -0700 Subject: [PATCH 045/177] [alpha.webkit.UncountedCallArgsChecker] Skip std::forward in tryToFindPtrOrigin. (#111222) Ignore std::forward when it appears while looking for the pointer origin. 
--- .../StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp | 5 +++++ .../Checkers/WebKit/uncounted-obj-arg.cpp | 15 +++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp index 394cb26f03cf99..b7b2f8a16f07b3 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp @@ -101,6 +101,11 @@ bool tryToFindPtrOrigin( if (isSingleton(callee)) return callback(E, true); + if (callee->isInStdNamespace() && safeGetName(callee) == "forward") { + E = call->getArg(0); + continue; + } + if (isPtrConversion(callee)) { E = call->getArg(0); continue; diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp index 97efb354f0371d..b6ab369f69a87d 100644 --- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp @@ -588,6 +588,8 @@ class UnrelatedClass { getFieldTrivial().nonTrivial23(); // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} } + + void setField(RefCounted*); }; class UnrelatedClass2 { @@ -598,11 +600,24 @@ class UnrelatedClass2 { RefCounted &getFieldTrivialRecursively() { return getFieldTrivial().getFieldTrivial(); } RefCounted *getFieldTrivialTernary() { return Field ? Field->getFieldTernary() : nullptr; } + template + void callSetField(T&& item, AdditionalArgs&&... args) + { + item.setField(std::forward(args)...); + } + + template + void callSetField2(T&& item, AdditionalArgs&&... 
args) + { + item.setField(std::move(args)...); + } + void test() { getFieldTrivialRecursively().trivial1(); // no-warning getFieldTrivialTernary()->trivial2(); // no-warning getFieldTrivialRecursively().someFunction(); // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} + callSetField(getFieldTrivial(), refCountedObj()); // no-warning } }; From 820bab8fb581f2fcd1a96b495f4762b02195d86a Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Thu, 10 Oct 2024 10:01:35 -0700 Subject: [PATCH 046/177] [alpha.webkit.UncountedCallArgsChecker] Add the support for trivial CXXInheritedCtorInitExpr. (#111198) --- .../Checkers/WebKit/PtrTypesSemantics.cpp | 4 ++++ .../Checkers/WebKit/uncounted-obj-arg.cpp | 21 +++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 4d145be808f6d8..317642c5b9ca20 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -508,6 +508,10 @@ class TrivialFunctionAnalysisVisitor return IsFunctionTrivial(CE->getConstructor()); } + bool VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *E) { + return IsFunctionTrivial(E->getConstructor()); + } + bool VisitCXXNewExpr(const CXXNewExpr *NE) { return VisitChildren(NE); } bool VisitImplicitCastExpr(const ImplicitCastExpr *ICE) { diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp index b6ab369f69a87d..1a42de90105a55 100644 --- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp @@ -224,6 +224,20 @@ class ObjectWithMutatingDestructor { Number n; }; +class BaseType { +public: + BaseType() : n(0) { } + BaseType(int v) : n(v) { } + BaseType(const char*); +private: + Number n; +}; + +class 
SomeType : public BaseType { +public: + using BaseType::BaseType; +}; + class RefCounted { public: void ref() const; @@ -336,6 +350,8 @@ class RefCounted { unsigned trivial60() { return ObjectWithNonTrivialDestructor { 5 }.value(); } unsigned trivial61() { return DerivedNumber('7').value(); } void trivial62() { WTFReportBacktrace(); } + SomeType trivial63() { return SomeType(0); } + SomeType trivial64() { return SomeType(); } static RefCounted& singleton() { static RefCounted s_RefCounted; @@ -425,6 +441,7 @@ class RefCounted { unsigned nonTrivial21() { return Number("123").value(); } unsigned nonTrivial22() { return ComplexNumber(123, "456").real().value(); } unsigned nonTrivial23() { return DerivedNumber("123").value(); } + SomeType nonTrivial24() { return SomeType("123"); } static unsigned s_v; unsigned v { 0 }; @@ -515,6 +532,8 @@ class UnrelatedClass { getFieldTrivial().trivial60(); // no-warning getFieldTrivial().trivial61(); // no-warning getFieldTrivial().trivial62(); // no-warning + getFieldTrivial().trivial63(); // no-warning + getFieldTrivial().trivial64(); // no-warning RefCounted::singleton().trivial18(); // no-warning RefCounted::singleton().someFunction(); // no-warning @@ -587,6 +606,8 @@ class UnrelatedClass { // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} getFieldTrivial().nonTrivial23(); // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} + getFieldTrivial().nonTrivial24(); + // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} } void setField(RefCounted*); From 39a91413c3f79181b4a45447bdb08d04d3efc975 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Thu, 10 Oct 2024 10:02:07 -0700 Subject: [PATCH 047/177] isUncountedPtr should take QualType as an argument. (#110213) Make isUncountedPtr take QualType as an argument instead of Type*. This simplifies some code. 
--- .../Checkers/WebKit/PtrTypesSemantics.cpp | 16 ++++------------ .../Checkers/WebKit/PtrTypesSemantics.h | 2 +- .../Checkers/WebKit/UncountedCallArgsChecker.cpp | 6 +----- .../WebKit/UncountedLambdaCapturesChecker.cpp | 10 +++++----- .../WebKit/UncountedLocalVarsChecker.cpp | 6 +----- 5 files changed, 12 insertions(+), 28 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 317642c5b9ca20..2298fe39850de5 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -177,14 +177,10 @@ std::optional isUncounted(const CXXRecordDecl* Class) return (*IsRefCountable); } -std::optional isUncountedPtr(const Type* T) -{ - assert(T); - +std::optional isUncountedPtr(const QualType T) { if (T->isPointerType() || T->isReferenceType()) { - if (auto *CXXRD = T->getPointeeCXXRecordDecl()) { + if (auto *CXXRD = T->getPointeeCXXRecordDecl()) return isUncounted(CXXRD); - } } return false; } @@ -208,12 +204,8 @@ std::optional isGetterOfRefCounted(const CXXMethodDecl* M) // Ref -> T conversion // FIXME: Currently allowing any Ref -> whatever cast. 
if (isRefType(className)) { - if (auto *maybeRefToRawOperator = dyn_cast(M)) { - if (auto *targetConversionType = - maybeRefToRawOperator->getConversionType().getTypePtrOrNull()) { - return isUncountedPtr(targetConversionType); - } - } + if (auto *maybeRefToRawOperator = dyn_cast(M)) + return isUncountedPtr(maybeRefToRawOperator->getConversionType()); } } return false; diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h index 3528c52a7d659d..8e6aadf63b6d67 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h @@ -61,7 +61,7 @@ std::optional isUncounted(const clang::CXXRecordDecl* Class); /// \returns true if \p T is either a raw pointer or reference to an uncounted /// class, false if not, std::nullopt if inconclusive. -std::optional isUncountedPtr(const clang::Type* T); +std::optional isUncountedPtr(const clang::QualType T); /// \returns true if Name is a RefPtr, Ref, or its variant, false if not. bool isRefType(const std::string &Name); diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp index 0ed93ab26bf5ca..cea3503fa2c314 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp @@ -115,12 +115,8 @@ class UncountedCallArgsChecker // continue; QualType ArgType = (*P)->getType().getCanonicalType(); - const auto *TypePtr = ArgType.getTypePtrOrNull(); - if (!TypePtr) - continue; // FIXME? Should we bail? 
- // FIXME: more complex types (arrays, references to raw pointers, etc) - std::optional IsUncounted = isUncountedPtr(TypePtr); + std::optional IsUncounted = isUncountedPtr(ArgType); if (!IsUncounted || !(*IsUncounted)) continue; diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp index a226a01ec0a579..998bd4ccee07db 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLambdaCapturesChecker.cpp @@ -59,11 +59,11 @@ class UncountedLambdaCapturesChecker for (const LambdaCapture &C : L->captures()) { if (C.capturesVariable()) { ValueDecl *CapturedVar = C.getCapturedVar(); - if (auto *CapturedVarType = CapturedVar->getType().getTypePtrOrNull()) { - std::optional IsUncountedPtr = isUncountedPtr(CapturedVarType); - if (IsUncountedPtr && *IsUncountedPtr) { - reportBug(C, CapturedVar, CapturedVarType); - } + QualType CapturedVarQualType = CapturedVar->getType(); + if (auto *CapturedVarType = CapturedVarQualType.getTypePtrOrNull()) { + auto IsUncountedPtr = isUncountedPtr(CapturedVarQualType); + if (IsUncountedPtr && *IsUncountedPtr) + reportBug(C, CapturedVar, CapturedVarType); } } } diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp index 9d0a3bb5da7325..81d21100de878d 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp @@ -199,11 +199,7 @@ class UncountedLocalVarsChecker if (shouldSkipVarDecl(V)) return; - const auto *ArgType = V->getType().getTypePtr(); - if (!ArgType) - return; - - std::optional IsUncountedPtr = isUncountedPtr(ArgType); + std::optional IsUncountedPtr = isUncountedPtr(V->getType()); if (IsUncountedPtr && *IsUncountedPtr) { if 
(tryToFindPtrOrigin( Value, /*StopAtFirstRefCountedObj=*/false, From 36c34ec967c28c77406fe85ef3237a167a243763 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Thu, 10 Oct 2024 18:07:06 +0100 Subject: [PATCH 048/177] [mlir][debug] Support DICommonBlock. (#111706) A COMMON block is a named area of memory that holds a collection of variables. Fortran subprograms may map the COMMON block memory area to a list of variables. A common block is represented in LLVM debug by DICommonBlock. This PR adds support for this in MLIR. The changes are mostly mechanical apart from small change to access the DICompileUnit when the scope of the variable is DICommonBlock. --------- Co-authored-by: Tobias Gysi --- .../mlir/Dialect/LLVMIR/LLVMAttrDefs.td | 16 ++++++++++ mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp | 20 +++++++------ mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 11 +++---- mlir/lib/Target/LLVMIR/DebugImporter.cpp | 9 ++++++ mlir/lib/Target/LLVMIR/DebugImporter.h | 1 + mlir/lib/Target/LLVMIR/DebugTranslation.cpp | 20 +++++++++---- mlir/lib/Target/LLVMIR/DebugTranslation.h | 1 + mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 28 ++++++++++------- mlir/test/Dialect/LLVMIR/debuginfo.mlir | 8 +++++ mlir/test/Target/LLVMIR/Import/debug-info.ll | 24 +++++++++++++++ mlir/test/Target/LLVMIR/llvmir-debug.mlir | 30 +++++++++++++++++++ 11 files changed, 138 insertions(+), 30 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td index c298c8277eb0c3..0d904f13037c61 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td @@ -701,6 +701,22 @@ def LLVM_DISubrangeAttr : LLVM_Attr<"DISubrange", "di_subrange", /*traits=*/[], let assemblyFormat = "`<` struct(params) `>`"; } +//===----------------------------------------------------------------------===// +// DICommonBlockAttr +//===----------------------------------------------------------------------===// + +def 
LLVM_DICommonBlockAttr : LLVM_Attr<"DICommonBlock", "di_common_block", + /*traits=*/[], "DIScopeAttr"> { + let parameters = (ins + "DIScopeAttr":$scope, + OptionalParameter<"DIGlobalVariableAttr">:$decl, + "StringAttr":$name, + OptionalParameter<"DIFileAttr">:$file, + OptionalParameter<"unsigned">:$line + ); + let assemblyFormat = "`<` struct(params) `>`"; +} + //===----------------------------------------------------------------------===// // DISubroutineTypeAttr //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp index 99871dac81d326..9640bbdf28df45 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAttrs.cpp @@ -56,13 +56,14 @@ void LLVMDialect::registerAttributes() { //===----------------------------------------------------------------------===// bool DINodeAttr::classof(Attribute attr) { - return llvm::isa(attr); + return llvm::isa( + attr); } //===----------------------------------------------------------------------===// @@ -70,8 +71,9 @@ bool DINodeAttr::classof(Attribute attr) { //===----------------------------------------------------------------------===// bool DIScopeAttr::classof(Attribute attr) { - return llvm::isa(attr); + return llvm::isa( + attr); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 2c7af8712d420c..006d412936a337 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -3369,11 +3369,12 @@ struct LLVMOpAsmDialectInterface : public OpAsmDialectInterface { AliasResult getAlias(Attribute attr, raw_ostream &os) const override { return TypeSwitch(attr) .CasegetStride())); } +DICommonBlockAttr DebugImporter::translateImpl(llvm::DICommonBlock *node) { + return DICommonBlockAttr::get(context, 
translate(node->getScope()), + translate(node->getDecl()), + getStringAttrOrNull(node->getRawName()), + translate(node->getFile()), node->getLineNo()); +} + DISubroutineTypeAttr DebugImporter::translateImpl(llvm::DISubroutineType *node) { SmallVector types; @@ -339,6 +346,8 @@ DINodeAttr DebugImporter::translate(llvm::DINode *node) { auto translateNode = [this](llvm::DINode *node) -> DINodeAttr { if (auto *casted = dyn_cast(node)) return translateImpl(casted); + if (auto *casted = dyn_cast(node)) + return translateImpl(casted); if (auto *casted = dyn_cast(node)) return translateImpl(casted); if (auto *casted = dyn_cast(node)) diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.h b/mlir/lib/Target/LLVMIR/DebugImporter.h index cb796676759c39..a452e01a9f6041 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.h +++ b/mlir/lib/Target/LLVMIR/DebugImporter.h @@ -79,6 +79,7 @@ class DebugImporter { DIScopeAttr translateImpl(llvm::DIScope *node); DISubprogramAttr translateImpl(llvm::DISubprogram *node); DISubrangeAttr translateImpl(llvm::DISubrange *node); + DICommonBlockAttr translateImpl(llvm::DICommonBlock *node); DISubroutineTypeAttr translateImpl(llvm::DISubroutineType *node); DITypeAttr translateImpl(llvm::DIType *node); diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp index 92ff079a10c8aa..2491db299af312 100644 --- a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp @@ -397,6 +397,13 @@ llvm::DISubrange *DebugTranslation::translateImpl(DISubrangeAttr attr) { getMetadataOrNull(attr.getStride())); } +llvm::DICommonBlock *DebugTranslation::translateImpl(DICommonBlockAttr attr) { + return llvm::DICommonBlock::get(llvmCtx, translate(attr.getScope()), + translate(attr.getDecl()), + getMDStringOrNull(attr.getName()), + translate(attr.getFile()), attr.getLine()); +} + llvm::DISubroutineType * DebugTranslation::translateImpl(DISubroutineTypeAttr attr) { // Concatenate the result and 
argument types into a single array. @@ -428,12 +435,13 @@ llvm::DINode *DebugTranslation::translate(DINodeAttr attr) { if (!node) node = TypeSwitch(attr) - .Case( + .Case( [&](auto attr) { return translateImpl(attr); }); if (node && !node->isTemporary()) diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.h b/mlir/lib/Target/LLVMIR/DebugTranslation.h index 422aa34e28f3c9..ff4eaa46c564e2 100644 --- a/mlir/lib/Target/LLVMIR/DebugTranslation.h +++ b/mlir/lib/Target/LLVMIR/DebugTranslation.h @@ -88,6 +88,7 @@ class DebugTranslation { llvm::DIScope *translateImpl(DIScopeAttr attr); llvm::DISubprogram *translateImpl(DISubprogramAttr attr); llvm::DISubrange *translateImpl(DISubrangeAttr attr); + llvm::DICommonBlock *translateImpl(DICommonBlockAttr attr); llvm::DISubroutineType *translateImpl(DISubroutineTypeAttr attr); llvm::DIType *translateImpl(DITypeAttr attr); diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index cc0de5bc838c99..a5de90160c4145 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -1064,19 +1064,27 @@ LogicalResult ModuleTranslation::convertGlobals() { // There is no `globals` field in DICompileUnitAttr which can be directly // assigned to DICompileUnit. We have to build the list by looking at the // dbgExpr of all the GlobalOps. The scope of the variable is used to get - // the DICompileUnit in which to add it. But for the languages that - // support modules, the scope hierarchy can be - // variable -> module -> compile unit - // If a variable scope points to the module then we use the scope of the - // module to get the compile unit. - // Global variables are also used for things like static local variables - // in C and local variables with the save attribute in Fortran. The scope - // of the variable is the parent function. We use the compile unit of the - // parent function in this case. + // the DICompileUnit in which to add it. 
+ // But there are cases where the scope of a global does not + // directly point to the DICompileUnit and we have to do a bit more work + // to get to it. Some of those cases are: + // + // 1. For the languages that support modules, the scope hierarchy can be + // variable -> DIModule -> DICompileUnit + // + // 2. For the Fortran common block variable, the scope hierarchy can be + // variable -> DICommonBlock -> DISubprogram -> DICompileUnit + // + // 3. For entities like static local variables in C or variable with + // SAVE attribute in Fortran, the scope hierarchy can be + // variable -> DISubprogram -> DICompileUnit llvm::DIScope *scope = diGlobalVar->getScope(); if (auto *mod = dyn_cast_if_present(scope)) scope = mod->getScope(); - else if (auto *sp = dyn_cast_if_present(scope)) + else if (auto *cb = dyn_cast_if_present(scope)) { + if (auto *sp = dyn_cast_if_present(cb->getScope())) + scope = sp->getUnit(); + } else if (auto *sp = dyn_cast_if_present(scope)) scope = sp->getUnit(); // Get the compile unit (scope) of the the global variable. 
diff --git a/mlir/test/Dialect/LLVMIR/debuginfo.mlir b/mlir/test/Dialect/LLVMIR/debuginfo.mlir index af95ec97833a13..8475ec6c3510db 100644 --- a/mlir/test/Dialect/LLVMIR/debuginfo.mlir +++ b/mlir/test/Dialect/LLVMIR/debuginfo.mlir @@ -156,6 +156,14 @@ // CHECK-DAG: #[[LABEL2:.*]] = #llvm.di_label #label2 = #llvm.di_label +// CHECK-DAG: #llvm.di_common_block +#di_common_block = #llvm.di_common_block +#global_var = #llvm.di_global_variable +#var_expression = #llvm.di_global_variable_expression> +llvm.mlir.global common @block_() {dbg_expr = #var_expression} : i64 + // CHECK: llvm.func @addr(%[[ARG:.*]]: i64) llvm.func @addr(%arg: i64) { // CHECK: %[[ALLOC:.*]] = llvm.alloca diff --git a/mlir/test/Target/LLVMIR/Import/debug-info.ll b/mlir/test/Target/LLVMIR/Import/debug-info.ll index 6267990b0bf803..09909d7d63b2ab 100644 --- a/mlir/test/Target/LLVMIR/Import/debug-info.ll +++ b/mlir/test/Target/LLVMIR/Import/debug-info.ll @@ -843,3 +843,27 @@ define void @fn_with_annotations() !dbg !12 { ; CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "fn_with_annotations"{{.*}}annotations = #llvm.di_annotation> + +; // ----- + +@block = common global [4 x i8] zeroinitializer, !dbg !0 + +define void @test() !dbg !3 { + ret void +} + +!llvm.module.flags = !{!10} +!llvm.dbg.cu = !{!7} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "alpha", scope: !2, file: !4, type: !9) +!2 = !DICommonBlock(scope: !3, declaration: null, name: "block", file: !4, line: 3) +!3 = distinct !DISubprogram(name: "test", scope: !4, file: !4, spFlags: DISPFlagDefinition, unit: !7) +!4 = !DIFile(filename: "test.f90", directory: "") +!7 = distinct !DICompileUnit(language: DW_LANG_Fortran95, file: !4) +!9 = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed) +!10 = !{i32 2, !"Debug Info Version", i32 3} + +; CHECK: #[[FILE:.+]] = #llvm.di_file<"test.f90" in ""> +; CHECK: #[[SP:.+]] = #llvm.di_subprogram<{{.*}}name = "test"{{.*}}> +; CHECK: 
#llvm.di_common_block diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir index b09a60b8dcac90..826fda60c5efef 100644 --- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir @@ -660,3 +660,33 @@ llvm.func @string_ty(%arg0: !llvm.ptr) { // CHECK-DAG: !DIStringType(name: "character(*)", stringLength: ![[VAR:[0-9]+]], stringLengthExpression: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 8), stringLocationExpression: !DIExpression(DW_OP_push_object_address, DW_OP_deref), size: 32, align: 8) // CHECK-DAG: ![[VAR]] = !DILocalVariable(name: "string_size"{{.*}} flags: DIFlagArtificial) + +// ----- + +// Test translation of DICommonBlockAttr. +#bt = #llvm.di_basic_type +#file = #llvm.di_file<"test.f90" in ""> +#cu = #llvm.di_compile_unit, sourceLanguage = DW_LANG_C, + file = #file, isOptimized = false, emissionKind = Full> +#sp = #llvm.di_subprogram +#di_common_block = #llvm.di_common_block +#global_var = #llvm.di_global_variable +#var_expression = #llvm.di_global_variable_expression> + +llvm.mlir.global common @block_(dense<0> : tensor<8xi8>) + {dbg_expr = #var_expression} : !llvm.array<8 x i8> + +llvm.func @test() { + llvm.return +} loc(#loc2) + +#loc1 = loc("test.f90":1:0) +#loc2 = loc(fused<#sp>[#loc1]) + +// CHECK: !DICommonBlock(scope: ![[SCOPE:[0-9]+]], declaration: null, name: "block", file: ![[FILE:[0-9]+]], line: 3) +// CHECK: ![[SCOPE]] = {{.*}}!DISubprogram(name: "test"{{.*}}) +// CHECK: ![[FILE]] = !DIFile(filename: "test.f90"{{.*}}) From 43ba97e7079525a9686e15a6963508dfbd493f81 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 10 Oct 2024 13:13:17 -0400 Subject: [PATCH 049/177] [runtimes][NFC] Reindent CMake files (#111821) This is a purely mechanical commit for fixing the indentation of the runtimes' CMakeLists files after #80007. 
That PR didn't update the indentation in order to make the diff easier to review and for merge conflicts to be easier to resolve (for downstream changes). This doesn't change any code, it only reindents it. --- libcxx/src/CMakeLists.txt | 194 +++++++++++++++++------------------ libcxxabi/src/CMakeLists.txt | 140 ++++++++++++------------- libunwind/src/CMakeLists.txt | 40 ++++---- 3 files changed, 187 insertions(+), 187 deletions(-) diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index 9f31822065be9d..4af04f202db1f7 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -173,76 +173,76 @@ split_list(LIBCXX_COMPILE_FLAGS) split_list(LIBCXX_LINK_FLAGS) # Build the shared library. - add_library(cxx_shared SHARED ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) - target_include_directories(cxx_shared PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) - target_link_libraries(cxx_shared PUBLIC cxx-headers libcxx-libc-shared - PRIVATE ${LIBCXX_LIBRARIES}) - set_target_properties(cxx_shared - PROPERTIES - EXCLUDE_FROM_ALL "$,FALSE,TRUE>" - COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}" - LINK_FLAGS "${LIBCXX_LINK_FLAGS}" - OUTPUT_NAME "${LIBCXX_SHARED_OUTPUT_NAME}" - VERSION "${LIBCXX_LIBRARY_VERSION}" - SOVERSION "${LIBCXX_ABI_VERSION}" - DEFINE_SYMBOL "" +add_library(cxx_shared SHARED ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) +target_include_directories(cxx_shared PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(cxx_shared PUBLIC cxx-headers libcxx-libc-shared + PRIVATE ${LIBCXX_LIBRARIES}) +set_target_properties(cxx_shared + PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" + COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}" + LINK_FLAGS "${LIBCXX_LINK_FLAGS}" + OUTPUT_NAME "${LIBCXX_SHARED_OUTPUT_NAME}" + VERSION "${LIBCXX_LIBRARY_VERSION}" + SOVERSION "${LIBCXX_ABI_VERSION}" + DEFINE_SYMBOL "" +) +cxx_add_common_build_flags(cxx_shared) + +if(ZOS) + add_custom_command(TARGET cxx_shared POST_BUILD + COMMAND + ${LIBCXX_SOURCE_DIR}/utils/zos_rename_dll_side_deck.sh + $ $ 
"${LIBCXX_DLL_NAME}" + COMMENT "Rename dll name inside the side deck file" + WORKING_DIRECTORY $ ) - cxx_add_common_build_flags(cxx_shared) - - if(ZOS) - add_custom_command(TARGET cxx_shared POST_BUILD - COMMAND - ${LIBCXX_SOURCE_DIR}/utils/zos_rename_dll_side_deck.sh - $ $ "${LIBCXX_DLL_NAME}" - COMMENT "Rename dll name inside the side deck file" - WORKING_DIRECTORY $ - ) - endif() +endif() - # Link against libc++abi - if (LIBCXX_STATICALLY_LINK_ABI_IN_SHARED_LIBRARY) - target_link_libraries(cxx_shared PRIVATE libcxx-abi-shared-objects) - else() - target_link_libraries(cxx_shared PUBLIC libcxx-abi-shared) - endif() +# Link against libc++abi +if (LIBCXX_STATICALLY_LINK_ABI_IN_SHARED_LIBRARY) + target_link_libraries(cxx_shared PRIVATE libcxx-abi-shared-objects) +else() + target_link_libraries(cxx_shared PUBLIC libcxx-abi-shared) +endif() - # Maybe force some symbols to be weak, not weak or not exported. - # TODO: This shouldn't depend on the platform, and ideally it should be done in the sources. - if (APPLE AND LIBCXX_CXX_ABI MATCHES "libcxxabi$" - AND NOT LIBCXX_STATICALLY_LINK_ABI_IN_SHARED_LIBRARY) - target_link_libraries(cxx_shared PRIVATE - "-Wl,-force_symbols_not_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/notweak.exp" - "-Wl,-force_symbols_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/weak.exp") - endif() +# Maybe force some symbols to be weak, not weak or not exported. +# TODO: This shouldn't depend on the platform, and ideally it should be done in the sources. +if (APPLE AND LIBCXX_CXX_ABI MATCHES "libcxxabi$" + AND NOT LIBCXX_STATICALLY_LINK_ABI_IN_SHARED_LIBRARY) + target_link_libraries(cxx_shared PRIVATE + "-Wl,-force_symbols_not_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/notweak.exp" + "-Wl,-force_symbols_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/weak.exp") +endif() - # Generate a linker script in place of a libc++.so symlink. 
- if (LIBCXX_ENABLE_ABI_LINKER_SCRIPT) - set(link_libraries) - - set(imported_libname "$") - set(output_name "$") - string(APPEND link_libraries "${CMAKE_LINK_LIBRARY_FLAG}$,${imported_libname},${output_name}>") - - # TODO: Move to the same approach as above for the unwind library - if (LIBCXXABI_USE_LLVM_UNWINDER) - if (LIBCXXABI_STATICALLY_LINK_UNWINDER_IN_SHARED_LIBRARY) - # libunwind is already included in libc++abi - elseif (TARGET unwind_shared OR HAVE_LIBUNWIND) - string(APPEND link_libraries " ${CMAKE_LINK_LIBRARY_FLAG}$") - else() - string(APPEND link_libraries " ${CMAKE_LINK_LIBRARY_FLAG}unwind") - endif() - endif() +# Generate a linker script in place of a libc++.so symlink. +if (LIBCXX_ENABLE_ABI_LINKER_SCRIPT) + set(link_libraries) - set(linker_script "INPUT($ ${link_libraries})") - add_custom_command(TARGET cxx_shared POST_BUILD - COMMAND "${CMAKE_COMMAND}" -E remove "$" - COMMAND "${CMAKE_COMMAND}" -E echo "${linker_script}" > "$" - COMMENT "Generating linker script: '${linker_script}' as file $" - VERBATIM - ) + set(imported_libname "$") + set(output_name "$") + string(APPEND link_libraries "${CMAKE_LINK_LIBRARY_FLAG}$,${imported_libname},${output_name}>") + + # TODO: Move to the same approach as above for the unwind library + if (LIBCXXABI_USE_LLVM_UNWINDER) + if (LIBCXXABI_STATICALLY_LINK_UNWINDER_IN_SHARED_LIBRARY) + # libunwind is already included in libc++abi + elseif (TARGET unwind_shared OR HAVE_LIBUNWIND) + string(APPEND link_libraries " ${CMAKE_LINK_LIBRARY_FLAG}$") + else() + string(APPEND link_libraries " ${CMAKE_LINK_LIBRARY_FLAG}unwind") + endif() endif() + set(linker_script "INPUT($ ${link_libraries})") + add_custom_command(TARGET cxx_shared POST_BUILD + COMMAND "${CMAKE_COMMAND}" -E remove "$" + COMMAND "${CMAKE_COMMAND}" -E echo "${linker_script}" > "$" + COMMENT "Generating linker script: '${linker_script}' as file $" + VERBATIM + ) +endif() + if (LIBCXX_ENABLE_SHARED) list(APPEND LIBCXX_BUILD_TARGETS "cxx_shared") endif() @@ 
-263,43 +263,43 @@ endif() set(CMAKE_STATIC_LIBRARY_PREFIX "lib") # Build the static library. - add_library(cxx_static STATIC ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) - target_include_directories(cxx_static PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) - target_link_libraries(cxx_static PUBLIC cxx-headers libcxx-libc-static - PRIVATE ${LIBCXX_LIBRARIES} - PRIVATE libcxx-abi-static) - set_target_properties(cxx_static - PROPERTIES - EXCLUDE_FROM_ALL "$,FALSE,TRUE>" - COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}" - LINK_FLAGS "${LIBCXX_LINK_FLAGS}" - OUTPUT_NAME "${LIBCXX_STATIC_OUTPUT_NAME}" - ) - cxx_add_common_build_flags(cxx_static) - - if (LIBCXX_HERMETIC_STATIC_LIBRARY) - # If the hermetic library doesn't define the operator new/delete functions - # then its code shouldn't declare them with hidden visibility. They might - # actually be provided by a shared library at link time. - if (LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS) - append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete=force-hidden) - if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG) - append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete-hidden) - endif() +add_library(cxx_static STATIC ${LIBCXX_SOURCES} ${LIBCXX_HEADERS}) +target_include_directories(cxx_static PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(cxx_static PUBLIC cxx-headers libcxx-libc-static + PRIVATE ${LIBCXX_LIBRARIES} + PRIVATE libcxx-abi-static) +set_target_properties(cxx_static + PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" + COMPILE_FLAGS "${LIBCXX_COMPILE_FLAGS}" + LINK_FLAGS "${LIBCXX_LINK_FLAGS}" + OUTPUT_NAME "${LIBCXX_STATIC_OUTPUT_NAME}" +) +cxx_add_common_build_flags(cxx_static) + +if (LIBCXX_HERMETIC_STATIC_LIBRARY) + # If the hermetic library doesn't define the operator new/delete functions + # then its code shouldn't declare them with hidden visibility. They might + # actually be provided by a shared library at link time. 
+ if (LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS) + append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete=force-hidden) + if (NOT CXX_SUPPORTS_FVISIBILITY_GLOBAL_NEW_DELETE_EQ_FORCE_HIDDEN_FLAG) + append_flags_if_supported(CXX_STATIC_LIBRARY_FLAGS -fvisibility-global-new-delete-hidden) endif() - target_compile_options(cxx_static PRIVATE ${CXX_STATIC_LIBRARY_FLAGS}) - # _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS can be defined in __config_site - # too. Define it in the same way here, to avoid redefinition conflicts. - target_compile_definitions(cxx_static PRIVATE _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS=) endif() + target_compile_options(cxx_static PRIVATE ${CXX_STATIC_LIBRARY_FLAGS}) + # _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS can be defined in __config_site + # too. Define it in the same way here, to avoid redefinition conflicts. + target_compile_definitions(cxx_static PRIVATE _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS=) +endif() - if (LIBCXX_ENABLE_STATIC) - list(APPEND LIBCXX_BUILD_TARGETS "cxx_static") - endif() - # Attempt to merge the libc++.a archive and the ABI library archive into one. - if (LIBCXX_STATICALLY_LINK_ABI_IN_STATIC_LIBRARY) - target_link_libraries(cxx_static PRIVATE libcxx-abi-static-objects) - endif() +if (LIBCXX_ENABLE_STATIC) + list(APPEND LIBCXX_BUILD_TARGETS "cxx_static") +endif() +# Attempt to merge the libc++.a archive and the ABI library archive into one. +if (LIBCXX_STATICALLY_LINK_ABI_IN_STATIC_LIBRARY) + target_link_libraries(cxx_static PRIVATE libcxx-abi-static-objects) +endif() # Add a meta-target for both libraries. 
add_custom_target(cxx DEPENDS ${LIBCXX_BUILD_TARGETS}) diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt index e496cf3339164e..84fe2784bec5ca 100644 --- a/libcxxabi/src/CMakeLists.txt +++ b/libcxxabi/src/CMakeLists.txt @@ -184,78 +184,78 @@ if (CMAKE_POSITION_INDEPENDENT_CODE OR NOT DEFINED CMAKE_POSITION_INDEPENDENT_CO endif() target_compile_options(cxxabi_shared_objects PRIVATE "${LIBCXXABI_ADDITIONAL_COMPILE_FLAGS}") - add_library(cxxabi_shared SHARED) - set_target_properties(cxxabi_shared - PROPERTIES - EXCLUDE_FROM_ALL "$,FALSE,TRUE>" - LINK_FLAGS "${LIBCXXABI_LINK_FLAGS}" - OUTPUT_NAME "${LIBCXXABI_SHARED_OUTPUT_NAME}" - SOVERSION "1" - VERSION "${LIBCXXABI_LIBRARY_VERSION}" - ) +add_library(cxxabi_shared SHARED) +set_target_properties(cxxabi_shared + PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" + LINK_FLAGS "${LIBCXXABI_LINK_FLAGS}" + OUTPUT_NAME "${LIBCXXABI_SHARED_OUTPUT_NAME}" + SOVERSION "1" + VERSION "${LIBCXXABI_LIBRARY_VERSION}" +) - if (ZOS) - add_custom_command(TARGET cxxabi_shared POST_BUILD - COMMAND - ${LIBCXXABI_LIBCXX_PATH}/utils/zos_rename_dll_side_deck.sh - $ $ "${LIBCXXABI_DLL_NAME}" - COMMENT "Rename dll name inside the side deck file" - WORKING_DIRECTORY $ - ) - endif () +if (ZOS) + add_custom_command(TARGET cxxabi_shared POST_BUILD + COMMAND + ${LIBCXXABI_LIBCXX_PATH}/utils/zos_rename_dll_side_deck.sh + $ $ "${LIBCXXABI_DLL_NAME}" + COMMENT "Rename dll name inside the side deck file" + WORKING_DIRECTORY $ + ) +endif () - target_link_libraries(cxxabi_shared - PUBLIC cxxabi_shared_objects - PRIVATE ${LIBCXXABI_LIBRARIES}) +target_link_libraries(cxxabi_shared + PUBLIC cxxabi_shared_objects + PRIVATE ${LIBCXXABI_LIBRARIES}) if (LIBCXXABI_ENABLE_SHARED) - list(APPEND LIBCXXABI_BUILD_TARGETS "cxxabi_shared") +list(APPEND LIBCXXABI_BUILD_TARGETS "cxxabi_shared") endif() if (LIBCXXABI_INSTALL_SHARED_LIBRARY) - list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_shared") +list(APPEND LIBCXXABI_INSTALL_TARGETS "cxxabi_shared") 
endif() - # TODO: Move this to libc++'s HandleLibCXXABI.cmake since this is effectively trying to control - # what libc++ re-exports. - add_library(cxxabi-reexports INTERFACE) - function(export_symbols file) - # -exported_symbols_list is only available on Apple platforms - if (APPLE) - target_link_libraries(cxxabi_shared PRIVATE "-Wl,-exported_symbols_list,${file}") - endif() - endfunction() +# TODO: Move this to libc++'s HandleLibCXXABI.cmake since this is effectively trying to control +# what libc++ re-exports. +add_library(cxxabi-reexports INTERFACE) +function(export_symbols file) + # -exported_symbols_list is only available on Apple platforms + if (APPLE) + target_link_libraries(cxxabi_shared PRIVATE "-Wl,-exported_symbols_list,${file}") + endif() +endfunction() - function(reexport_symbols file) - export_symbols("${file}") - # -reexported_symbols_list is only available on Apple platforms - if (APPLE) - target_link_libraries(cxxabi-reexports INTERFACE "-Wl,-reexported_symbols_list,${file}") - endif() - endfunction() +function(reexport_symbols file) + export_symbols("${file}") + # -reexported_symbols_list is only available on Apple platforms + if (APPLE) + target_link_libraries(cxxabi-reexports INTERFACE "-Wl,-reexported_symbols_list,${file}") + endif() +endfunction() - export_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/symbols-not-reexported.exp") - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/cxxabiv1.exp") - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/fundamental-types.exp") - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-base.exp") - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/std-misc.exp") +export_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/symbols-not-reexported.exp") +reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/cxxabiv1.exp") +reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/fundamental-types.exp") +reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-base.exp") 
+reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/std-misc.exp") - if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS) - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/new-delete.exp") - endif() +if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS) + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/new-delete.exp") +endif() - # Note that std:: exception types are always defined by the library regardless of - # whether the exception runtime machinery is provided. - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/std-exceptions.exp") +# Note that std:: exception types are always defined by the library regardless of +# whether the exception runtime machinery is provided. +reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/std-exceptions.exp") - if (LIBCXXABI_ENABLE_EXCEPTIONS) - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-exceptions.exp") +if (LIBCXXABI_ENABLE_EXCEPTIONS) + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-exceptions.exp") - if ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "^(armv6|armv7|armv7s)$") - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-sjlj.exp") - else() - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-v0.exp") - endif() + if ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "^(armv6|armv7|armv7s)$") + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-sjlj.exp") + else() + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-v0.exp") endif() +endif() # Build the static library. 
add_library(cxxabi_static_objects OBJECT EXCLUDE_FROM_ALL ${LIBCXXABI_SOURCES} ${LIBCXXABI_HEADERS}) @@ -295,19 +295,19 @@ if(LIBCXXABI_HERMETIC_STATIC_LIBRARY) _LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS=) endif() - add_library(cxxabi_static STATIC) - if (LIBCXXABI_USE_LLVM_UNWINDER AND NOT LIBCXXABI_STATICALLY_LINK_UNWINDER_IN_STATIC_LIBRARY) - target_link_libraries(cxxabi_static PUBLIC unwind_static) - endif() - set_target_properties(cxxabi_static - PROPERTIES - EXCLUDE_FROM_ALL "$,FALSE,TRUE>" - LINK_FLAGS "${LIBCXXABI_LINK_FLAGS}" - OUTPUT_NAME "${LIBCXXABI_STATIC_OUTPUT_NAME}" - ) - target_link_libraries(cxxabi_static - PUBLIC cxxabi_static_objects - PRIVATE ${LIBCXXABI_STATIC_LIBRARIES} ${LIBCXXABI_LIBRARIES}) +add_library(cxxabi_static STATIC) +if (LIBCXXABI_USE_LLVM_UNWINDER AND NOT LIBCXXABI_STATICALLY_LINK_UNWINDER_IN_STATIC_LIBRARY) + target_link_libraries(cxxabi_static PUBLIC unwind_static) +endif() +set_target_properties(cxxabi_static + PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" + LINK_FLAGS "${LIBCXXABI_LINK_FLAGS}" + OUTPUT_NAME "${LIBCXXABI_STATIC_OUTPUT_NAME}" + ) +target_link_libraries(cxxabi_static + PUBLIC cxxabi_static_objects + PRIVATE ${LIBCXXABI_STATIC_LIBRARIES} ${LIBCXXABI_LIBRARIES}) if (LIBCXXABI_ENABLE_STATIC) list(APPEND LIBCXXABI_BUILD_TARGETS "cxxabi_static") diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt index 3065bfc8a07050..2e18b109656331 100644 --- a/libunwind/src/CMakeLists.txt +++ b/libunwind/src/CMakeLists.txt @@ -153,17 +153,17 @@ if (CMAKE_POSITION_INDEPENDENT_CODE OR NOT DEFINED CMAKE_POSITION_INDEPENDENT_CO set_target_properties(unwind_shared_objects PROPERTIES POSITION_INDEPENDENT_CODE ON) # must set manually because it's an object library endif() - add_library(unwind_shared SHARED) - target_link_libraries(unwind_shared PUBLIC unwind_shared_objects) - set_target_properties(unwind_shared - PROPERTIES - EXCLUDE_FROM_ALL "$,FALSE,TRUE>" - LINK_FLAGS "${LIBUNWIND_LINK_FLAGS}" - LINKER_LANGUAGE C - 
OUTPUT_NAME "${LIBUNWIND_SHARED_OUTPUT_NAME}" - VERSION "${LIBUNWIND_LIBRARY_VERSION}" - SOVERSION "1" - ) +add_library(unwind_shared SHARED) +target_link_libraries(unwind_shared PUBLIC unwind_shared_objects) +set_target_properties(unwind_shared + PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" + LINK_FLAGS "${LIBUNWIND_LINK_FLAGS}" + LINKER_LANGUAGE C + OUTPUT_NAME "${LIBUNWIND_SHARED_OUTPUT_NAME}" + VERSION "${LIBUNWIND_LIBRARY_VERSION}" + SOVERSION "1" +) if (LIBUNWIND_ENABLE_SHARED) list(APPEND LIBUNWIND_BUILD_TARGETS "unwind_shared") @@ -200,15 +200,15 @@ if(LIBUNWIND_HIDE_SYMBOLS) target_compile_definitions(unwind_static_objects PRIVATE _LIBUNWIND_HIDE_SYMBOLS) endif() - add_library(unwind_static STATIC) - target_link_libraries(unwind_static PUBLIC unwind_static_objects) - set_target_properties(unwind_static - PROPERTIES - EXCLUDE_FROM_ALL "$,FALSE,TRUE>" - LINK_FLAGS "${LIBUNWIND_LINK_FLAGS}" - LINKER_LANGUAGE C - OUTPUT_NAME "${LIBUNWIND_STATIC_OUTPUT_NAME}" - ) +add_library(unwind_static STATIC) +target_link_libraries(unwind_static PUBLIC unwind_static_objects) +set_target_properties(unwind_static + PROPERTIES + EXCLUDE_FROM_ALL "$,FALSE,TRUE>" + LINK_FLAGS "${LIBUNWIND_LINK_FLAGS}" + LINKER_LANGUAGE C + OUTPUT_NAME "${LIBUNWIND_STATIC_OUTPUT_NAME}" +) if (LIBUNWIND_ENABLE_STATIC) list(APPEND LIBUNWIND_BUILD_TARGETS "unwind_static") From 3f9998af4f79e95fe8be615df9d6b898008044b9 Mon Sep 17 00:00:00 2001 From: Justin Fargnoli Date: Thu, 10 Oct 2024 10:24:02 -0700 Subject: [PATCH 050/177] [NVPTX] Prefer prmt.b32 over bfi.b32 (#110766) In [[NVPTX] Improve lowering of v4i8](https://github.com/llvm/llvm-project/commit/cbafb6f2f5c99474164dcc725820cbbeb2e02e14) @Artem-B add the ability to lower ISD::BUILD_VECTOR with bfi PTX instructions. @Artem-B did this because: ([source](https://github.com/llvm/llvm-project/pull/67866#discussion_r1343066911)) > Under the hood byte extraction/insertion ends up as BFI/BFE instructions, so we may as well do that in PTX, too. 
https://godbolt.org/z/Tb3zWbj9b However, the example that @Artem-B linked was targeting sm_52. On modern architectures, ptxas uses prmt.b32. [Example](https://godbolt.org/z/Ye4W1n84o). Thus, remove uses of NVPTXISD::BFI in favor of NVPTXISD::PRMT. --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 31 +- llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 614 ++++++++++--------- llvm/test/CodeGen/NVPTX/sext-setcc.ll | 18 +- 3 files changed, 335 insertions(+), 328 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 57bc5fe0ac361c..d95f8f214be557 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2332,20 +2332,23 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us // to optimize calculation of constant parts. if (VT == MVT::v4i8) { - SDValue C8 = DAG.getConstant(8, DL, MVT::i32); - SDValue E01 = DAG.getNode( - NVPTXISD::BFI, DL, MVT::i32, - DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), - DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); - SDValue E012 = - DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, - DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), - E01, DAG.getConstant(16, DL, MVT::i32), C8); - SDValue E0123 = - DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, - DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), - E012, DAG.getConstant(24, DL, MVT::i32), C8); - return DAG.getNode(ISD::BITCAST, DL, VT, E0123); + SDValue PRMT__10 = DAG.getNode( + NVPTXISD::PRMT, DL, MVT::v4i8, + {DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), + DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), + DAG.getConstant(0x3340, DL, MVT::i32), + DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); + SDValue PRMT32__ = DAG.getNode( + NVPTXISD::PRMT, DL, MVT::v4i8, + {DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), + 
DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), + DAG.getConstant(0x4033, DL, MVT::i32), + DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); + SDValue PRMT3210 = DAG.getNode( + NVPTXISD::PRMT, DL, MVT::v4i8, + {PRMT__10, PRMT32__, DAG.getConstant(0x5410, DL, MVT::i32), + DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); + return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210); } return Op; } diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 96a4359d0ec43e..84dde539ce4c47 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -101,38 +101,38 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_add( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; +; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: add.s16 %rs9, 
%rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; -; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; ; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, %b @@ -143,29 +143,29 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 { ; CHECK-LABEL: test_add_imm_0( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: add.s16 %rs2, %rs1, 4; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 2; +; CHECK-NEXT: add.s16 %rs4, %rs3, 3; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 16435; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 3; +; CHECK-NEXT: add.s16 %rs6, %rs5, 2; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; 
CHECK-NEXT: add.s16 %rs8, %rs7, 4; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> , %a @@ -176,29 +176,29 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 { ; CHECK-LABEL: test_add_imm_1( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: add.s16 %rs2, %rs1, 4; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 2; +; CHECK-NEXT: add.s16 %rs4, %rs3, 3; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 16435; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 3; +; CHECK-NEXT: add.s16 %rs6, %rs5, 2; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: add.s16 %rs8, %rs7, 4; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 
13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, @@ -209,38 +209,38 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_sub( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; +; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; -; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; ; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; -; CHECK-NEXT: 
bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = sub <4 x i8> %a, %b @@ -251,38 +251,38 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<27>; +; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; ; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r13, %r17, 
%p3; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; -; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 16435; +; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; ; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; -; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; -; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; -; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; +; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; +; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; +; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120; +; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sgt <4 x i8> %a, %b @@ -294,30 +294,30 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_umax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; ; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; 
CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; ; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ugt <4 x i8> %a, %b @@ -329,38 +329,38 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<27>; +; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; 
+; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; ; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; -; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 16435; +; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; ; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; -; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; -; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; -; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; +; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; +; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; +; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120; +; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sle <4 x i8> %a, %b @@ -372,30 +372,30 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_umin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.ls.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.ls.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.ls.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; +; CHECK-NEXT: 
bfe.u32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.ls.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; ; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; ; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ule <4 x i8> %a, %b @@ -407,35 +407,35 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-LABEL: test_eq( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<24>; +; CHECK-NEXT: .reg .b32 %r<23>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r3, [test_eq_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0]; -; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; ; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; ; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; ; CHECK-NEXT: 
selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; -; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 16435; +; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; -; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; -; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; -; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; +; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; +; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; +; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 13120; +; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; ; CHECK-NEXT: ret; %cmp = icmp eq <4 x i8> %a, %b @@ -447,35 +447,35 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-LABEL: test_ne( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<24>; +; CHECK-NEXT: .reg .b32 %r<23>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0]; -; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; +; CHECK-NEXT: 
bfe.u32 %r11, %r1, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; -; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 16435; +; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; -; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; -; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; -; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; +; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; +; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; +; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 13120; +; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; ; CHECK-NEXT: ret; %cmp = icmp ne <4 x i8> %a, %b @@ -487,38 +487,38 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_mul( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; 
CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; +; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; -; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; ; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = mul <4 x i8> %a, %b @@ -548,12 +548,13 @@ define <4 x i8> @test_or_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; -; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; -; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; -; CHECK-NEXT: or.b32 %r8, %r6, %r4; +; CHECK-NEXT: mov.b32 %r1, 0; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; +; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; CHECK-NEXT: or.b32 %r8, %r6, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -613,12 +614,13 @@ define <4 x 
i8> @test_xor_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; -; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; -; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; -; CHECK-NEXT: xor.b32 %r8, %r6, %r4; +; CHECK-NEXT: mov.b32 %r1, 0; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; +; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; CHECK-NEXT: xor.b32 %r8, %r6, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -678,12 +680,13 @@ define <4 x i8> @test_and_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; -; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; -; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; -; CHECK-NEXT: and.b32 %r8, %r6, %r4; +; CHECK-NEXT: mov.b32 %r1, 0; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; +; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; CHECK-NEXT: and.b32 %r8, %r6, %r5; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -926,40 +929,40 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> ; CHECK-LABEL: test_select_cc( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<29>; +; CHECK-NEXT: .reg .b32 %r<28>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r4, [test_select_cc_param_3]; ; CHECK-NEXT: ld.param.u32 %r3, 
[test_select_cc_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0]; -; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; -; CHECK-NEXT: bfe.u32 %r6, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r4, 0, 8; +; CHECK-NEXT: bfe.u32 %r6, %r3, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; -; CHECK-NEXT: bfe.u32 %r8, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r4, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r3, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r4, 8, 8; -; CHECK-NEXT: bfe.u32 %r10, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r9, %r4, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r3, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r4, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r4, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r12, %r11; -; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r13, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; ; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4; -; CHECK-NEXT: bfe.u32 %r16, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r17, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r16, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r17, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3; -; CHECK-NEXT: bfi.b32 %r19, %r18, %r15, 8, 8; -; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r21, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r19, %r18, %r15, 16435; +; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r21, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2; -; CHECK-NEXT: bfi.b32 %r23, %r22, %r19, 16, 8; -; CHECK-NEXT: bfe.u32 %r24, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r25, %r1, 24, 8; -; CHECK-NEXT: selp.b32 %r26, %r25, %r24, %p1; -; CHECK-NEXT: bfi.b32 %r27, %r26, %r23, 24, 8; +; CHECK-NEXT: bfe.u32 %r23, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r24, %r1, 0, 8; +; CHECK-NEXT: selp.b32 
%r25, %r24, %r23, %p1; +; CHECK-NEXT: prmt.b32 %r26, %r25, %r22, 13120; +; CHECK-NEXT: prmt.b32 %r27, %r26, %r19, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r27; ; CHECK-NEXT: ret; %cc = icmp ne <4 x i8> %c, %d @@ -1006,32 +1009,32 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, ; CHECK-LABEL: test_select_cc_i8_i32( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<27>; +; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; ; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_i8_i32_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_i8_i32_param_0]; -; CHECK-NEXT: setp.ne.s32 %p1, %r6, %r10; -; CHECK-NEXT: setp.ne.s32 %p2, %r5, %r9; -; CHECK-NEXT: setp.ne.s32 %p3, %r4, %r8; -; CHECK-NEXT: setp.ne.s32 %p4, %r3, %r7; -; CHECK-NEXT: bfe.u32 %r11, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 0, 8; +; CHECK-NEXT: setp.ne.s32 %p1, %r3, %r7; +; CHECK-NEXT: setp.ne.s32 %p2, %r4, %r8; +; CHECK-NEXT: setp.ne.s32 %p3, %r5, %r9; +; CHECK-NEXT: setp.ne.s32 %p4, %r6, %r10; +; CHECK-NEXT: bfe.u32 %r11, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 24, 8; ; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r15, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r15, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 8, 8; -; CHECK-NEXT: bfe.u32 %r18, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r19, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 16435; +; CHECK-NEXT: bfe.u32 %r18, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2; -; CHECK-NEXT: bfi.b32 %r21, %r20, %r17, 16, 8; -; CHECK-NEXT: bfe.u32 %r22, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r23, %r1, 24, 8; -; CHECK-NEXT: 
selp.b32 %r24, %r23, %r22, %p1; -; CHECK-NEXT: bfi.b32 %r25, %r24, %r21, 24, 8; +; CHECK-NEXT: bfe.u32 %r21, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r22, %r1, 0, 8; +; CHECK-NEXT: selp.b32 %r23, %r22, %r21, %p1; +; CHECK-NEXT: prmt.b32 %r24, %r23, %r20, 13120; +; CHECK-NEXT: prmt.b32 %r25, %r24, %r17, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; <4 x i32> %c, <4 x i32> %d) #0 { @@ -1044,13 +1047,13 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { ; CHECK-LABEL: test_trunc_2xi32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0]; -; CHECK-NEXT: bfi.b32 %r5, %r2, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r6, %r3, %r5, 16, 8; -; CHECK-NEXT: bfi.b32 %r7, %r4, %r6, 24, 8; +; CHECK-NEXT: prmt.b32 %r5, %r3, %r4, 16435; +; CHECK-NEXT: prmt.b32 %r6, %r1, %r2, 13120; +; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i32> %a to <4 x i8> @@ -1060,19 +1063,19 @@ define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 { ; CHECK-LABEL: test_trunc_2xi64( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; ; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; -; CHECK-NEXT: cvt.u32.u64 %r1, %rd1; -; CHECK-NEXT: cvt.u32.u64 %r2, %rd2; -; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; -; CHECK-NEXT: cvt.u32.u64 %r4, %rd3; -; CHECK-NEXT: bfi.b32 %r5, %r4, %r3, 16, 8; -; CHECK-NEXT: cvt.u32.u64 %r6, %rd4; -; CHECK-NEXT: bfi.b32 %r7, %r6, %r5, 24, 8; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd4; +; CHECK-NEXT: cvt.u32.u64 %r2, %rd3; +; CHECK-NEXT: prmt.b32 
%r3, %r2, %r1, 16435; +; CHECK-NEXT: cvt.u32.u64 %r4, %rd2; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd1; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 13120; +; CHECK-NEXT: prmt.b32 %r7, %r6, %r3, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i64> %a to <4 x i8> @@ -1184,15 +1187,16 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { ; CHECK-LABEL: test_bitcast_4xi8_to_2xhalf( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; -; CHECK-NEXT: bfi.b32 %r2, 5, %r1, 8, 8; -; CHECK-NEXT: bfi.b32 %r3, 6, %r2, 16, 8; -; CHECK-NEXT: bfi.b32 %r4, 7, %r3, 24, 8; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: mov.b32 %r1, 6; +; CHECK-NEXT: prmt.b32 %r2, %r1, 7, 16435; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r3, 5, 13120; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -1255,27 +1259,27 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptosi_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs6; -; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, 
%r4; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; +; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs7; ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs11; -; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; -; CHECK-NEXT: cvt.u32.u16 %r12, %rs12; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 13120; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13; ; CHECK-NEXT: ret; %r = fptosi <4 x half> %a to <4 x i8> @@ -1286,27 +1290,27 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptoui_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs6; -; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; +; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; +; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs7; ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs11; -; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; -; 
CHECK-NEXT: cvt.u32.u16 %r12, %rs12; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 13120; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13; ; CHECK-NEXT: ret; %r = fptoui <4 x half> %a to <4 x i8> @@ -1326,33 +1330,33 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_srem_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 %r2, [%rd2]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.s8.s32 %rs1, %r3; -; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; ; CHECK-NEXT: cvt.s8.s32 %rs2, %r4; ; CHECK-NEXT: rem.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs4, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs5, %r7; ; CHECK-NEXT: rem.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; -; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; +; CHECK-NEXT: bfe.s32 %r10, %r2, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs7, %r10; -; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r11, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs8, %r11; ; CHECK-NEXT: rem.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; -; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8; -; CHECK-NEXT: cvt.s8.s32 %rs10, %r14; -; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8; -; CHECK-NEXT: cvt.s8.s32 %rs11, %r15; +; CHECK-NEXT: bfe.s32 %r13, %r2, 0, 8; +; CHECK-NEXT: cvt.s8.s32 %rs10, %r13; +; CHECK-NEXT: bfe.s32 %r14, %r1, 0, 8; +; CHECK-NEXT: cvt.s8.s32 %rs11, %r14; ; CHECK-NEXT: rem.s16 %rs12, %rs11, 
%rs10; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; ; CHECK-NEXT: st.u32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: @@ -1373,7 +1377,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: test_srem_v3i8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<20>; -; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b32 %r<17>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry @@ -1392,25 +1396,25 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs9; ; CHECK-NEXT: ld.s8 %rs10, [%rd2+2]; -; CHECK-NEXT: bfe.s32 %r5, %r3, 0, 8; +; CHECK-NEXT: bfe.s32 %r5, %r3, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs11, %r5; -; CHECK-NEXT: bfe.s32 %r6, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs12, %r6; ; CHECK-NEXT: rem.s16 %rs13, %rs12, %rs11; ; CHECK-NEXT: cvt.u32.u16 %r7, %rs13; -; CHECK-NEXT: bfe.s32 %r8, %r3, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r3, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs14, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs15, %r9; ; CHECK-NEXT: rem.s16 %rs16, %rs15, %rs14; ; CHECK-NEXT: cvt.u32.u16 %r10, %rs16; -; CHECK-NEXT: bfi.b32 %r11, %r10, %r7, 8, 8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r7, 13120; ; CHECK-NEXT: // implicit-def: %r13 -; CHECK-NEXT: bfi.b32 %r12, %r13, %r11, 16, 8; -; CHECK-NEXT: // implicit-def: %r15 -; CHECK-NEXT: bfi.b32 %r14, %r15, %r12, 24, 8; +; CHECK-NEXT: // implicit-def: %r14 +; CHECK-NEXT: prmt.b32 %r12, %r13, %r14, 16435; +; CHECK-NEXT: prmt.b32 %r15, %r11, %r12, 21520; ; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10; -; CHECK-NEXT: cvt.u16.u32 %rs18, %r14; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs18, tmp}, %r15; } ; CHECK-NEXT: st.u8 [%rd3], %rs18; ; CHECK-NEXT: 
shr.u16 %rs19, %rs18, 8; ; CHECK-NEXT: st.u8 [%rd3+1], %rs19; @@ -1437,25 +1441,25 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 %r2, [%rd2]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.s32 %r11, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r12, -1, 0, %p3; -; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; ; CHECK-NEXT: selp.s32 %r14, -1, 0, %p2; -; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; -; CHECK-NEXT: selp.s32 %r16, -1, 0, %p1; -; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; +; CHECK-NEXT: selp.s32 %r15, -1, 0, %p1; +; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; +; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; ; CHECK-NEXT: st.u32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll index f471d47077cf0d..8b7e5235443f05 100644 --- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll +++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll @@ -33,35 +33,35 @@ define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; ; CHECK-NEXT: .reg 
.b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.u64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: and.b16 %rs2, %rs1, 255; ; CHECK-NEXT: setp.eq.s16 %p1, %rs2, 0; -; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r3, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r3; ; CHECK-NEXT: and.b16 %rs4, %rs3, 255; ; CHECK-NEXT: setp.eq.s16 %p2, %rs4, 0; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r4; ; CHECK-NEXT: and.b16 %rs6, %rs5, 255; ; CHECK-NEXT: setp.eq.s16 %p3, %rs6, 0; -; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r5; ; CHECK-NEXT: and.b16 %rs8, %rs7, 255; ; CHECK-NEXT: setp.eq.s16 %p4, %rs8, 0; ; CHECK-NEXT: selp.s32 %r6, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r7, -1, 0, %p3; -; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; +; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; ; CHECK-NEXT: selp.s32 %r9, -1, 0, %p2; -; CHECK-NEXT: bfi.b32 %r10, %r9, %r8, 16, 8; -; CHECK-NEXT: selp.s32 %r11, -1, 0, %p1; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r10, 24, 8; +; CHECK-NEXT: selp.s32 %r10, -1, 0, %p1; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r9, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r8, 21520; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; entry: From c893e3d02d1f7b67880090485a030b79741bba1c Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Thu, 10 Oct 2024 10:24:59 -0700 Subject: [PATCH 051/177] [flang][runtime] Fix runtime crash after bad recoverable OPEN (#111454) When an OPEN statement with a unit number fails in a recoverable manner, the runtime needs to delete the ExternalFileUnit instance that was created in the unit map. 
And we do this too soon -- that instance still holds some of the I/O statement state that will be used by a later call into the runtime for EndIoStatement. Move the code that deletes the unit after a failed but recoverable OPEN into ExternalIoStatementBase::EndIoStatement, and don't do things afterwards that would need the I/O statement state that has been destroyed. Fixes https://github.com/llvm/llvm-project/issues/111404. --- flang/runtime/io-stmt.cpp | 14 +++++++++----- flang/runtime/io-stmt.h | 2 ++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp index cd7a196335d31e..f24eb929ce748a 100644 --- a/flang/runtime/io-stmt.cpp +++ b/flang/runtime/io-stmt.cpp @@ -243,7 +243,15 @@ int ExternalIoStatementBase::EndIoStatement() { CompleteOperation(); auto result{IoStatementBase::EndIoStatement()}; #if !defined(RT_USE_PSEUDO_FILE_UNIT) + auto unitNumber{unit_.unitNumber()}; unit_.EndIoStatement(); // annihilates *this in unit_.u_ + if (destroy_) { + if (ExternalFileUnit * + toClose{ExternalFileUnit::LookUpForClose(unitNumber)}) { + toClose->Close(CloseStatus::Delete, *this); + toClose->DestroyClosed(); + } + } #else // Fetch the unit pointer before *this disappears. 
ExternalFileUnit *unitPtr{&unit_}; @@ -329,11 +337,7 @@ void OpenStatementState::CompleteOperation() { } if (!wasExtant_ && InError()) { // Release the new unit on failure - if (ExternalFileUnit * - toClose{unit().LookUpForClose(unit().unitNumber())}) { - toClose->Close(CloseStatus::Delete, *this); - toClose->DestroyClosed(); - } + set_destroy(); } IoStatementBase::CompleteOperation(); } diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h index 2e0ca46078ecdc..1f1419b249e5e5 100644 --- a/flang/runtime/io-stmt.h +++ b/flang/runtime/io-stmt.h @@ -455,6 +455,7 @@ class ExternalIoStatementBase : public IoStatementBase { RT_API_ATTRS MutableModes &mutableModes(); RT_API_ATTRS ConnectionState &GetConnectionState(); RT_API_ATTRS int asynchronousID() const { return asynchronousID_; } + RT_API_ATTRS void set_destroy(bool yes = true) { destroy_ = yes; } RT_API_ATTRS int EndIoStatement(); RT_API_ATTRS ExternalFileUnit *GetExternalFileUnit() const { return &unit_; } RT_API_ATTRS void SetAsynchronous(); @@ -463,6 +464,7 @@ class ExternalIoStatementBase : public IoStatementBase { private: ExternalFileUnit &unit_; int asynchronousID_{-1}; + bool destroy_{false}; }; template From 4f2b65fb80a4b27e5fb88db816ed0ce174c9b1b4 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Thu, 10 Oct 2024 10:25:19 -0700 Subject: [PATCH 052/177] [flang] Fix references to destroyed objects (#111582) ProgramTree instances are created as the value of a local variable in the Pre(const parser::ProgramUnit &) member function in name resolution. But references to these ProgramTree instances can persist in SubprogramNameDetails symbol table entries that might survive that function call's lifetime, and lead to trouble later when (e.g.) expression semantics needs to deal with a possible forward reference in a function reference in an expression being processed later in expression checking. So put those ProgramTree instances into a longer-lived linked list within the SemanticsContext. 
Might fix some weird crashes reported on big-endian targets (AIX & Solaris). --- flang/{lib => include/flang}/Semantics/program-tree.h | 4 ++-- flang/include/flang/Semantics/semantics.h | 7 ++++++- flang/lib/Semantics/program-tree.cpp | 8 ++++---- flang/lib/Semantics/resolve-names.cpp | 5 +++-- flang/lib/Semantics/semantics.cpp | 4 ++++ 5 files changed, 19 insertions(+), 9 deletions(-) rename flang/{lib => include/flang}/Semantics/program-tree.h (97%) diff --git a/flang/lib/Semantics/program-tree.h b/flang/include/flang/Semantics/program-tree.h similarity index 97% rename from flang/lib/Semantics/program-tree.h rename to flang/include/flang/Semantics/program-tree.h index ab00261a964a13..1c89e6c175b964 100644 --- a/flang/lib/Semantics/program-tree.h +++ b/flang/include/flang/Semantics/program-tree.h @@ -9,8 +9,8 @@ #ifndef FORTRAN_SEMANTICS_PROGRAM_TREE_H_ #define FORTRAN_SEMANTICS_PROGRAM_TREE_H_ +#include "symbol.h" #include "flang/Parser/parse-tree.h" -#include "flang/Semantics/symbol.h" #include #include @@ -35,7 +35,7 @@ class ProgramTree { std::list>; // Build the ProgramTree rooted at one of these program units. 
- static ProgramTree Build(const parser::ProgramUnit &, SemanticsContext &); + static ProgramTree &Build(const parser::ProgramUnit &, SemanticsContext &); static std::optional Build( const parser::MainProgram &, SemanticsContext &); static std::optional Build( diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h index 606afbe288c38d..c981d86fbd94cb 100644 --- a/flang/include/flang/Semantics/semantics.h +++ b/flang/include/flang/Semantics/semantics.h @@ -9,6 +9,8 @@ #ifndef FORTRAN_SEMANTICS_SEMANTICS_H_ #define FORTRAN_SEMANTICS_SEMANTICS_H_ +#include "module-dependences.h" +#include "program-tree.h" #include "scope.h" #include "symbol.h" #include "flang/Common/Fortran-features.h" @@ -17,7 +19,6 @@ #include "flang/Evaluate/intrinsics.h" #include "flang/Evaluate/target.h" #include "flang/Parser/message.h" -#include "flang/Semantics/module-dependences.h" #include #include #include @@ -280,6 +281,9 @@ class SemanticsContext { void DumpSymbols(llvm::raw_ostream &); + // Top-level ProgramTrees are owned by the SemanticsContext for persistence. 
+ ProgramTree &SaveProgramTree(ProgramTree &&); + private: struct ScopeIndexComparator { bool operator()(parser::CharBlock, parser::CharBlock) const; @@ -331,6 +335,7 @@ class SemanticsContext { ModuleDependences moduleDependences_; std::map moduleFileOutputRenamings_; UnorderedSymbolSet isDefined_; + std::list programTrees_; }; class Semantics { diff --git a/flang/lib/Semantics/program-tree.cpp b/flang/lib/Semantics/program-tree.cpp index 250f5801b39e1a..86085e78803a23 100644 --- a/flang/lib/Semantics/program-tree.cpp +++ b/flang/lib/Semantics/program-tree.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "program-tree.h" +#include "flang/Semantics/program-tree.h" #include "flang/Common/idioms.h" #include "flang/Parser/char-block.h" #include "flang/Semantics/scope.h" @@ -130,13 +130,13 @@ static ProgramTree BuildModuleTree( return node; } -ProgramTree ProgramTree::Build( +ProgramTree &ProgramTree::Build( const parser::ProgramUnit &x, SemanticsContext &context) { return common::visit( - [&](const auto &y) { + [&](const auto &y) -> ProgramTree & { auto node{Build(y.value(), context)}; CHECK(node.has_value()); - return std::move(*node); + return context.SaveProgramTree(std::move(*node)); }, x.u); } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index e5e03f644f1b00..f1ce0b415ebe9c 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -10,7 +10,6 @@ #include "definable.h" #include "mod-file.h" #include "pointer-assignment.h" -#include "program-tree.h" #include "resolve-directives.h" #include "resolve-names-utils.h" #include "rewrite-parse-tree.h" @@ -32,6 +31,7 @@ #include "flang/Parser/tools.h" #include "flang/Semantics/attr.h" #include "flang/Semantics/expression.h" +#include "flang/Semantics/program-tree.h" #include "flang/Semantics/scope.h" #include "flang/Semantics/semantics.h" #include "flang/Semantics/symbol.h" 
@@ -2490,6 +2490,7 @@ Symbol &ScopeHandler::CopySymbol(const SourceName &name, const Symbol &symbol) { } // Look for name only in scope, not in enclosing scopes. + Symbol *ScopeHandler::FindInScope( const Scope &scope, const parser::Name &name) { return Resolve(name, FindInScope(scope, name.source)); @@ -9120,7 +9121,7 @@ bool ResolveNamesVisitor::Pre(const parser::ProgramUnit &x) { ResolveAccParts(context(), x, &topScope_); return false; } - auto root{ProgramTree::Build(x, context())}; + ProgramTree &root{ProgramTree::Build(x, context())}; SetScope(topScope_); ResolveSpecificationParts(root); FinishSpecificationParts(root); diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp index 637088ff0171c0..58dc1f218b56f4 100644 --- a/flang/lib/Semantics/semantics.cpp +++ b/flang/lib/Semantics/semantics.cpp @@ -663,6 +663,10 @@ void SemanticsContext::DumpSymbols(llvm::raw_ostream &os) { DoDumpSymbols(os, globalScope()); } +ProgramTree &SemanticsContext::SaveProgramTree(ProgramTree &&tree) { + return programTrees_.emplace_back(std::move(tree)); +} + void Semantics::DumpSymbols(llvm::raw_ostream &os) { context_.DumpSymbols(os); } void Semantics::DumpSymbolsSources(llvm::raw_ostream &os) const { From 2f22656db541e4e5c3401e7bbab25277c8438a23 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Thu, 10 Oct 2024 10:25:42 -0700 Subject: [PATCH 053/177] [flang] Minor cleanup (move function into /tools.cpp) (#111587) The semantics utility GetAllNames has declarations in two header files and a definition that really should be in the common utilities source file. Remove the redudant declaration from resolve-names-utils.h and move code from resolve-names-utils.cpp into Semantics/tools.cpp. 
--- flang/lib/Semantics/resolve-names-utils.cpp | 33 --------------------- flang/lib/Semantics/resolve-names-utils.h | 5 ---- flang/lib/Semantics/tools.cpp | 31 +++++++++++++++++++ 3 files changed, 31 insertions(+), 38 deletions(-) diff --git a/flang/lib/Semantics/resolve-names-utils.cpp b/flang/lib/Semantics/resolve-names-utils.cpp index b8ce8d14a33faa..a838d49c06104d 100644 --- a/flang/lib/Semantics/resolve-names-utils.cpp +++ b/flang/lib/Semantics/resolve-names-utils.cpp @@ -31,8 +31,6 @@ using common::NumericOperator; using common::RelationalOperator; using IntrinsicOperator = parser::DefinedOperator::IntrinsicOperator; -static constexpr const char *operatorPrefix{"operator("}; - static GenericKind MapIntrinsicOperator(IntrinsicOperator); Symbol *Resolve(const parser::Name &name, Symbol *symbol) { @@ -69,37 +67,6 @@ bool IsIntrinsicOperator( return false; } -template -std::forward_list GetOperatorNames( - const SemanticsContext &context, E opr) { - std::forward_list result; - for (const char *name : context.languageFeatures().GetNames(opr)) { - result.emplace_front(std::string{operatorPrefix} + name + ')'); - } - return result; -} - -std::forward_list GetAllNames( - const SemanticsContext &context, const SourceName &name) { - std::string str{name.ToString()}; - if (!name.empty() && name.end()[-1] == ')' && - name.ToString().rfind(std::string{operatorPrefix}, 0) == 0) { - for (int i{0}; i != common::LogicalOperator_enumSize; ++i) { - auto names{GetOperatorNames(context, LogicalOperator{i})}; - if (llvm::is_contained(names, str)) { - return names; - } - } - for (int i{0}; i != common::RelationalOperator_enumSize; ++i) { - auto names{GetOperatorNames(context, RelationalOperator{i})}; - if (llvm::is_contained(names, str)) { - return names; - } - } - } - return {str}; -} - bool IsLogicalConstant( const SemanticsContext &context, const SourceName &name) { std::string str{name.ToString()}; diff --git a/flang/lib/Semantics/resolve-names-utils.h 
b/flang/lib/Semantics/resolve-names-utils.h index 5b537d80e5f880..64784722ff4f84 100644 --- a/flang/lib/Semantics/resolve-names-utils.h +++ b/flang/lib/Semantics/resolve-names-utils.h @@ -51,11 +51,6 @@ parser::MessageFixedText WithSeverity( bool IsIntrinsicOperator(const SemanticsContext &, const SourceName &); bool IsLogicalConstant(const SemanticsContext &, const SourceName &); -// Some intrinsic operators have more than one name (e.g. `operator(.eq.)` and -// `operator(==)`). GetAllNames() returns them all, including symbolName. -std::forward_list GetAllNames( - const SemanticsContext &, const SourceName &); - template MaybeIntExpr EvaluateIntExpr(SemanticsContext &context, const T &expr) { if (MaybeExpr maybeExpr{ diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 4d2a0a607abe89..379d5d0eb3eef0 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -1654,6 +1654,37 @@ bool HasDefinedIo(common::DefinedIo which, const DerivedTypeSpec &derived, return parentType && HasDefinedIo(which, *parentType, scope); } +template +std::forward_list GetOperatorNames( + const SemanticsContext &context, E opr) { + std::forward_list result; + for (const char *name : context.languageFeatures().GetNames(opr)) { + result.emplace_front("operator("s + name + ')'); + } + return result; +} + +std::forward_list GetAllNames( + const SemanticsContext &context, const SourceName &name) { + std::string str{name.ToString()}; + if (!name.empty() && name.end()[-1] == ')' && + name.ToString().rfind("operator(", 0) == 0) { + for (int i{0}; i != common::LogicalOperator_enumSize; ++i) { + auto names{GetOperatorNames(context, common::LogicalOperator{i})}; + if (llvm::is_contained(names, str)) { + return names; + } + } + for (int i{0}; i != common::RelationalOperator_enumSize; ++i) { + auto names{GetOperatorNames(context, common::RelationalOperator{i})}; + if (llvm::is_contained(names, str)) { + return names; + } + } + } + return {str}; +} + 
void WarnOnDeferredLengthCharacterScalar(SemanticsContext &context, const SomeExpr *expr, parser::CharBlock at, const char *what) { if (context.languageFeatures().ShouldWarn( From 7e16571eb02e7e9da24fee45359e981af783d0d0 Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang Date: Thu, 10 Oct 2024 19:27:27 +0200 Subject: [PATCH 054/177] [lldb][libc++] Hide all libc++ implementation details from stacktraces (#108870) This commit changes the libc++ frame recognizer to hide implementation details of libc++ more aggressively. The applied heuristic is rather straightforward: We consider every function name starting with `__` as an implementation detail. This works pretty neatly for `std::invoke`, `std::function`, `std::sort`, `std::map::emplace` and many others. Also, this should align quite nicely with libc++'s general coding convention of using the `__` for their implementation details, thereby keeping the future maintenance effort low. However, this heuristic by itself does not work in 100% of the cases: E.g., `std::ranges::sort` is not a function, but an object with an overloaded `operator()`, which means that there is no actual call `std::ranges::sort` in the call stack. Instead, there is a `std::ranges::__sort::operator()` call. 
To make sure that we don't hide this stack frame, we never hide the frame which represents the entry point from user code into libc++ code --- libcxx/docs/UserDocumentation.rst | 29 +++++++ .../CPlusPlus/CPPLanguageRuntime.cpp | 49 ++++++----- .../Makefile | 2 +- .../TestLibcxxInternalsRecognizer.py | 67 +++++++++++++++ .../cpp/libcxx-internals-recognizer/main.cpp | 86 +++++++++++++++++++ .../TestStdInvokeRecognizer.py | 44 ---------- .../lang/cpp/std-invoke-recognizer/main.cpp | 30 ------- 7 files changed, 211 insertions(+), 96 deletions(-) rename lldb/test/API/lang/cpp/{std-invoke-recognizer => libcxx-internals-recognizer}/Makefile (68%) create mode 100644 lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py create mode 100644 lldb/test/API/lang/cpp/libcxx-internals-recognizer/main.cpp delete mode 100644 lldb/test/API/lang/cpp/std-invoke-recognizer/TestStdInvokeRecognizer.py delete mode 100644 lldb/test/API/lang/cpp/std-invoke-recognizer/main.cpp diff --git a/libcxx/docs/UserDocumentation.rst b/libcxx/docs/UserDocumentation.rst index f5e55994aa7572..1db437ce58b95e 100644 --- a/libcxx/docs/UserDocumentation.rst +++ b/libcxx/docs/UserDocumentation.rst @@ -355,6 +355,35 @@ Third-party Integrations Libc++ provides integration with a few third-party tools. +Debugging libc++ internals in LLDB +---------------------------------- + +LLDB hides the implementation details of libc++ by default. + +E.g., when setting a breakpoint in a comparator passed to ``std::sort``, the +backtrace will read as + +.. 
code-block:: + + (lldb) thread backtrace + * thread #1, name = 'a.out', stop reason = breakpoint 3.1 + * frame #0: 0x000055555555520e a.out`my_comparator(a=1, b=8) at test-std-sort.cpp:6:3 + frame #7: 0x0000555555555615 a.out`void std::__1::sort[abi:ne200000], bool (*)(int, int)>(__first=(item = 8), __last=(item = 0), __comp=(a.out`my_less(int, int) at test-std-sort.cpp:5)) at sort.h:1003:3 + frame #8: 0x000055555555531a a.out`main at test-std-sort.cpp:24:3 + +Note how the caller of ``my_comparator`` is shown as ``std::sort``. Looking at +the frame numbers, we can see that frames #1 until #6 were hidden. Those frames +represent internal implementation details such as ``__sort4`` and similar +utility functions. + +To also show those implementation details, use ``thread backtrace -u``. +Alternatively, to disable those compact backtraces, use ``frame recognizer list`` +and ``frame recognizer disable`` on the "libc++ frame recognizer". + +Futhermore, stepping into libc++ functions is disabled by default. This is controlled via the +setting ``target.process.thread.step-avoid-regexp`` which defaults to ``^std::`` and can be +disabled using ``settings set target.process.thread.step-avoid-regexp ""``. + GDB Pretty printers for libc++ ------------------------------ diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index faa05e8f834ea1..e7ca3f655f237c 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -45,7 +45,7 @@ char CPPLanguageRuntime::ID = 0; /// A frame recognizer that is installed to hide libc++ implementation /// details from the backtrace. 
class LibCXXFrameRecognizer : public StackFrameRecognizer { - std::array m_hidden_regex; + std::array m_hidden_regex; RecognizedStackFrameSP m_hidden_frame; struct LibCXXHiddenFrame : public RecognizedStackFrame { @@ -55,28 +55,17 @@ class LibCXXFrameRecognizer : public StackFrameRecognizer { public: LibCXXFrameRecognizer() : m_hidden_regex{ - // internal implementation details of std::function + // internal implementation details in the `std::` namespace // std::__1::__function::__alloc_func, void ()>::operator()[abi:ne200000] // std::__1::__function::__func, void ()>::operator() // std::__1::__function::__value_func::operator()[abi:ne200000]() const - RegularExpression{"" - R"(^std::__[^:]*::)" // Namespace. - R"(__function::.*::operator\(\))"}, - // internal implementation details of std::function in ABI v2 // std::__2::__function::__policy_invoker::__call_impl[abi:ne200000]> - RegularExpression{"" - R"(^std::__[^:]*::)" // Namespace. - R"(__function::.*::__call_impl)"}, - // internal implementation details of std::invoke - // std::__1::__invoke[abi:ne200000] - RegularExpression{ - R"(^std::__[^:]*::)" // Namespace. - R"(__invoke)"}, - // internal implementation details of std::invoke - // std::__1::__invoke_void_return_wrapper::__call[abi:ne200000] - RegularExpression{ - R"(^std::__[^:]*::)" // Namespace. 
- R"(__invoke_void_return_wrapper<.*>::__call)"} + // std::__1::__invoke[abi:ne200000] + // std::__1::__invoke_void_return_wrapper::__call[abi:ne200000] + RegularExpression{R"(^std::__[^:]*::__)"}, + // internal implementation details in the `std::ranges` namespace + // std::__1::ranges::__sort::__sort_fn_impl[abi:ne200000], std::__1::__wrap_iter, bool (*)(int, int), std::__1::identity> + RegularExpression{R"(^std::__[^:]*::ranges::__)"}, }, m_hidden_frame(new LibCXXHiddenFrame()) {} @@ -90,9 +79,27 @@ class LibCXXFrameRecognizer : public StackFrameRecognizer { if (!sc.function) return {}; - for (RegularExpression &r : m_hidden_regex) - if (r.Execute(sc.function->GetNameNoArguments())) + // Check if we have a regex match + for (RegularExpression &r : m_hidden_regex) { + if (!r.Execute(sc.function->GetNameNoArguments())) + continue; + + // Only hide this frame if the immediate caller is also within libc++. + lldb::ThreadSP thread_sp = frame_sp->GetThread(); + if (!thread_sp) + return {}; + lldb::StackFrameSP parent_frame_sp = + thread_sp->GetStackFrameAtIndex(frame_sp->GetFrameIndex() + 1); + if (!parent_frame_sp) + return {}; + const auto &parent_sc = + parent_frame_sp->GetSymbolContext(lldb::eSymbolContextFunction); + if (!parent_sc.function) + return {}; + if (parent_sc.function->GetNameNoArguments().GetStringRef().starts_with( + "std::")) return m_hidden_frame; + } return {}; } diff --git a/lldb/test/API/lang/cpp/std-invoke-recognizer/Makefile b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/Makefile similarity index 68% rename from lldb/test/API/lang/cpp/std-invoke-recognizer/Makefile rename to lldb/test/API/lang/cpp/libcxx-internals-recognizer/Makefile index 69014eb9c0f2eb..bb571299664934 100644 --- a/lldb/test/API/lang/cpp/std-invoke-recognizer/Makefile +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/Makefile @@ -1,5 +1,5 @@ CXX_SOURCES := main.cpp USE_LIBCPP := 1 -CXXFLAGS_EXTRAS := -std=c++17 +CXXFLAGS_EXTRAS := -std=c++20 include 
Makefile.rules diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py new file mode 100644 index 00000000000000..ad48208f21e502 --- /dev/null +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py @@ -0,0 +1,67 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class LibCxxInternalsRecognizerTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + @add_test_categories(["libc++"]) + def test_frame_recognizer(self): + """Test that implementation details of libc++ are hidden""" + self.build() + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "break here", lldb.SBFileSpec("main.cpp") + ) + + expected_parents = { + "sort_less(int, int)": ["::sort", "test_algorithms"], + # `std::ranges::sort` is implemented as an object of types `__sort`. + # We never hide the frame of the entry-point into the standard library, even + # if the name starts with `__` which usually indicates an internal function. 
+ "ranges_sort_less(int, int)": [ + "ranges::__sort::operator()", + "test_algorithms", + ], + # `ranges::views::transform` internally uses `std::invoke`, and that + # call also shows up in the stack trace + "view_transform(int)": [ + "::invoke", + "ranges::transform_view", + "test_algorithms", + ], + # Various types of `invoke` calls + "consume_number(int)": ["::invoke", "test_invoke"], + "invoke_add(int, int)": ["::invoke", "test_invoke"], + "Callable::member_function(int) const": ["::invoke", "test_invoke"], + "Callable::operator()(int) const": ["::invoke", "test_invoke"], + # Containers + "MyKey::operator<(MyKey const&) const": [ + "less", + "::emplace", + "test_containers", + ], + } + stop_set = set() + while process.GetState() != lldb.eStateExited: + fn = thread.GetFrameAtIndex(0).GetFunctionName() + stop_set.add(fn) + self.assertIn(fn, expected_parents.keys()) + frame_id = 1 + for expected_parent in expected_parents[fn]: + # Skip all hidden frames + while ( + frame_id < thread.GetNumFrames() + and thread.GetFrameAtIndex(frame_id).IsHidden() + ): + frame_id = frame_id + 1 + # Expect the correct parent frame + self.assertIn( + expected_parent, thread.GetFrameAtIndex(frame_id).GetFunctionName() + ) + frame_id = frame_id + 1 + process.Continue() + + # Make sure that we actually verified all intended scenarios + self.assertEqual(len(stop_set), len(expected_parents)) diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/main.cpp b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/main.cpp new file mode 100644 index 00000000000000..870301b0970439 --- /dev/null +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/main.cpp @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include + +bool sort_less(int a, int b) { + __builtin_printf("break here"); + return a < b; +} + +bool ranges_sort_less(int a, int b) { + __builtin_printf("break here"); + return a < b; +} + +int view_transform(int a) { + __builtin_printf("break here"); + return a * a; +} + 
+void test_algorithms() { + std::vector vec{8, 1, 3, 2}; + + // The internal frames for `std::sort` should be hidden + std::sort(vec.begin(), vec.end(), sort_less); + + // The internal frames for `ranges::sort` should be hidden + std::ranges::sort(vec.begin(), vec.end(), ranges_sort_less); + + // Same for views + for (auto x : vec | std::ranges::views::transform(view_transform)) { + // no-op + } +} + +void consume_number(int i) { __builtin_printf("break here"); } + +int invoke_add(int i, int j) { + __builtin_printf("break here"); + return i + j; +} + +struct Callable { + Callable(int num) : num_(num) {} + void operator()(int i) const { __builtin_printf("break here"); } + void member_function(int i) const { __builtin_printf("break here"); } + int num_; +}; + +void test_invoke() { + // Invoke a void-returning function + std::invoke(consume_number, -9); + + // Invoke a non-void-returning function + std::invoke(invoke_add, 1, 10); + + // Invoke a member function + const Callable foo(314159); + std::invoke(&Callable::member_function, foo, 1); + + // Invoke a function object + std::invoke(Callable(12), 18); +} + +struct MyKey { + int x; + bool operator==(const MyKey &) const = default; + bool operator<(const MyKey &other) const { + __builtin_printf("break here"); + return x < other.x; + } +}; + +void test_containers() { + std::map map; + map.emplace(MyKey{1}, 2); + map.emplace(MyKey{2}, 3); +} + +int main() { + test_algorithms(); + test_invoke(); + test_containers(); + return 0; +} diff --git a/lldb/test/API/lang/cpp/std-invoke-recognizer/TestStdInvokeRecognizer.py b/lldb/test/API/lang/cpp/std-invoke-recognizer/TestStdInvokeRecognizer.py deleted file mode 100644 index dbe29610bf7982..00000000000000 --- a/lldb/test/API/lang/cpp/std-invoke-recognizer/TestStdInvokeRecognizer.py +++ /dev/null @@ -1,44 +0,0 @@ -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - - -class 
LibCxxStdFunctionRecognizerTestCase(TestBase): - NO_DEBUG_INFO_TESTCASE = True - - @add_test_categories(["libc++"]) - def test_frame_recognizer(self): - """Test that implementation details of `std::invoke` are hidden""" - self.build() - (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( - self, "break here", lldb.SBFileSpec("main.cpp") - ) - - stop_cnt = 0 - while process.GetState() != lldb.eStateExited: - stop_cnt += 1 - self.assertTrue( - any( - f in thread.GetFrameAtIndex(0).GetFunctionName() - for f in ["consume_number", "add", "Callable"] - ) - ) - # Skip all hidden frames - frame_id = 1 - while ( - frame_id < thread.GetNumFrames() - and thread.GetFrameAtIndex(frame_id).IsHidden() - ): - frame_id = frame_id + 1 - # Expect `std::invoke` to be the direct parent - self.assertIn( - "::invoke", thread.GetFrameAtIndex(frame_id).GetFunctionName() - ) - # And right above that, there should be the `main` frame - self.assertIn( - "main", thread.GetFrameAtIndex(frame_id + 1).GetFunctionName() - ) - process.Continue() - - self.assertEqual(stop_cnt, 4) diff --git a/lldb/test/API/lang/cpp/std-invoke-recognizer/main.cpp b/lldb/test/API/lang/cpp/std-invoke-recognizer/main.cpp deleted file mode 100644 index bafbbd28386e8b..00000000000000 --- a/lldb/test/API/lang/cpp/std-invoke-recognizer/main.cpp +++ /dev/null @@ -1,30 +0,0 @@ -#include - -void consume_number(int i) { __builtin_printf("break here"); } - -int add(int i, int j) { - // break here - return i + j; -} - -struct Callable { - Callable(int num) : num_(num) {} - void operator()(int i) const { __builtin_printf("break here"); } - void member_function(int i) const { __builtin_printf("break here"); } - int num_; -}; - -int main() { - // Invoke a void-returning function - std::invoke(consume_number, -9); - - // Invoke a non-void-returning function - std::invoke(add, 1, 10); - - // Invoke a member function - const Callable foo(314159); - std::invoke(&Callable::member_function, foo, 1); - - // Invoke a 
function object - std::invoke(Callable(12), 18); -} From 7026960ecfe156223c4126495c146ce0d42c64a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 10 Oct 2024 10:31:03 -0700 Subject: [PATCH 055/177] [flang][runtime][NFC] Fix header guard typo (#111741) Header guard was in sync with the filename. --- flang/include/flang/Runtime/allocator-registry.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flang/include/flang/Runtime/allocator-registry.h b/flang/include/flang/Runtime/allocator-registry.h index acfada506fafc6..3ccee56dc3fc0f 100644 --- a/flang/include/flang/Runtime/allocator-registry.h +++ b/flang/include/flang/Runtime/allocator-registry.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef FORTRAN_RUNTIME_ALLOCATOR_H_ -#define FORTRAN_RUNTIME_ALLOCATOR_H_ +#ifndef FORTRAN_RUNTIME_ALLOCATOR_REGISTRY_H_ +#define FORTRAN_RUNTIME_ALLOCATOR_REGISTRY_H_ #include "flang/Common/api-attrs.h" #include @@ -62,4 +62,4 @@ RT_OFFLOAD_VAR_GROUP_END } // namespace Fortran::runtime -#endif // FORTRAN_RUNTIME_ALLOCATOR_H_ +#endif // FORTRAN_RUNTIME_ALLOCATOR_REGISTRY_H_ From 99c8557c175e88ff1c338c4c29e3a4d63c5a46cb Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Thu, 10 Oct 2024 18:52:20 +0100 Subject: [PATCH 056/177] Fix GCC build problem with 03483737a7a2 --- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 4f350ea236da84..c909d13e4314b4 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -157,7 +157,7 @@ static void fillStructuredOpRegion(OpBuilder &opBuilder, Region ®ion, /// Helper to create a typical indexing map for MatmulOp. 
Returns a list of /// AffineMap. -static SmallVector +static SmallVector getDefaultIndexingMapsForMatmul(MLIRContext *context) { AffineExpr d0, d1, d2; SmallVector indexingMaps; From 453d373e80f3ed8d67c92956101f7b9fa9467116 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 8 Oct 2024 23:22:44 -0700 Subject: [PATCH 057/177] [lsan] Add a few "\n" missing from VReport --- compiler-rt/lib/lsan/lsan_common.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp index 438aa3a85f6724..6776598651ae9b 100644 --- a/compiler-rt/lib/lsan/lsan_common.cpp +++ b/compiler-rt/lib/lsan/lsan_common.cpp @@ -780,10 +780,10 @@ static bool PrintResults(LeakReport &report) { static bool CheckForLeaks() { if (&__lsan_is_turned_off && __lsan_is_turned_off()) { - VReport(1, "LeakSanitizer is disabled"); + VReport(1, "LeakSanitizer is disabled\n"); return false; } - VReport(1, "LeakSanitizer: checking for leaks"); + VReport(1, "LeakSanitizer: checking for leaks\n"); // Inside LockStuffAndStopTheWorld we can't run symbolizer, so we can't match // suppressions. However if a stack id was previously suppressed, it should be // suppressed in future checks as well. 
From 62b3a4bc708885f8ded09c900a79ad509f02e54a Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 10 Oct 2024 19:40:02 +0100 Subject: [PATCH 058/177] [AMDGPU] Improve codegen for s_barrier_init (#111866) --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 +--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index bbdc006b9afcf0..3d8e03521e2b90 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10031,9 +10031,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, // If reference to barrier id is not an inline constant then it must be // referenced with M0[4:0]. Perform an OR with the member count to // include it in M0. - M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, - Op.getOperand(2), M0Val), - 0); + M0Val = DAG.getNode(ISD::OR, DL, MVT::i32, Op.getOperand(2), M0Val); } Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); } else if (IsInlinableBarID) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 4fb28b392c9ea9..1e13b40afb8be8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -737,11 +737,9 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 ; GFX12-SDAG-NEXT: 
s_barrier_init m0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffe From ba530e6b64a27876ef5ea8e29806260d8bc00926 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Thu, 10 Oct 2024 11:42:23 -0700 Subject: [PATCH 059/177] [bazel] Add initial clang-doc config (#111779) --- .../clang-tools-extra/clang-doc/BUILD.bazel | 45 +++++++++++++++++++ .../clang-tools-extra/unittests/BUILD.bazel | 21 +++++++++ 2 files changed, 66 insertions(+) create mode 100644 utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel new file mode 100644 index 00000000000000..d7b9723b875c37 --- /dev/null +++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel @@ -0,0 +1,45 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") + +package( + default_visibility = ["//visibility:public"], + features = ["layering_check"], +) + +licenses(["notice"]) + +cc_library( + name = "lib", + srcs = glob(["*.cpp"]), + hdrs = glob(["*.h"]), + includes = ["."], + deps = [ + "//clang:ast", + "//clang:basic", + "//clang:frontend", + "//clang:index", + "//clang:lex", + "//clang:tooling", + "//llvm:BitstreamReader", + "//llvm:BitstreamWriter", + "//llvm:Support", + ], +) + +cc_binary( + name = "clang-doc", + srcs = ["tool/ClangDocMain.cpp"], + stamp = 0, + deps = [ + ":lib", + "//clang:ast", + "//clang:ast_matchers", + "//clang:driver", + "//clang:frontend", + "//clang:tooling", + "//llvm:Support", + ], +) diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/unittests/BUILD.bazel index 12e87cec4b76b8..47ec4552856416 100644 --- 
a/utils/bazel/llvm-project-overlay/clang-tools-extra/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/unittests/BUILD.bazel @@ -53,3 +53,24 @@ cc_test( "//third-party/unittest:gtest_main", ], ) + +cc_test( + name = "clang_doc_test", + size = "small", + srcs = glob( + [ + "clang-doc/*.cpp", + "clang-doc/*.h", + ], + allow_empty = False, + ), + deps = [ + "//clang:ast", + "//clang:basic", + "//clang-tools-extra/clang-doc:lib", + "//llvm:BitstreamReader", + "//llvm:BitstreamWriter", + "//third-party/unittest:gtest", + "//third-party/unittest:gtest_main", + ], +) From d36cef0b173329fa1f94ff3a92da6a50da4aff9e Mon Sep 17 00:00:00 2001 From: Finn Plummer <50529406+inbelic@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:44:44 -0700 Subject: [PATCH 060/177] [HLSL][DXIL] Implement WaveGetLaneIndex Intrinsic (#111576) - add additional lowering for directx backend in CGBuiltin.cpp - add directx intrinsic to IntrinsicsDirectX.td - add semantic check of arguments in SemaHLSL.cpp - add mapping to DXIL op in DXIL.td - add testing of semantics in WaveGetLaneIndex-errors.hlsl - add testing of dxil lowering in WaveGetLaneIndex.ll Resolves #70105 --- clang/lib/CodeGen/CGBuiltin.cpp | 18 ++++++++++++++--- clang/lib/Sema/SemaHLSL.cpp | 5 +++++ .../builtins/wave_get_lane_index_simple.hlsl | 20 +++++++++++++------ .../BuiltIns/WaveGetLaneIndex-errors.hlsl | 6 ++++++ llvm/include/llvm/IR/IntrinsicsDirectX.td | 1 + llvm/lib/Target/DirectX/DXIL.td | 9 +++++++++ llvm/test/CodeGen/DirectX/WaveGetLaneIndex.ll | 10 ++++++++++ 7 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 clang/test/SemaHLSL/BuiltIns/WaveGetLaneIndex-errors.hlsl create mode 100644 llvm/test/CodeGen/DirectX/WaveGetLaneIndex.ll diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 2449b90a0e7902..06140d6d4ce27b 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18867,9 +18867,21 @@ case 
Builtin::BI__builtin_hlsl_elementwise_isinf: { ArrayRef{Op0, Op1}, nullptr, "hlsl.step"); } case Builtin::BI__builtin_hlsl_wave_get_lane_index: { - return EmitRuntimeCall(CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IntTy, {}, false), "__hlsl_wave_get_lane_index", - {}, false, true)); + // We don't define a SPIR-V intrinsic, instead it is a SPIR-V built-in + // defined in SPIRVBuiltins.td. So instead we manually get the matching name + // for the DirectX intrinsic and the demangled builtin name + switch (CGM.getTarget().getTriple().getArch()) { + case llvm::Triple::dxil: + return EmitRuntimeCall(Intrinsic::getDeclaration( + &CGM.getModule(), Intrinsic::dx_wave_getlaneindex)); + case llvm::Triple::spirv: + return EmitRuntimeCall(CGM.CreateRuntimeFunction( + llvm::FunctionType::get(IntTy, {}, false), + "__hlsl_wave_get_lane_index", {}, false, true)); + default: + llvm_unreachable( + "Intrinsic WaveGetLaneIndex not supported by target architecture"); + } } case Builtin::BI__builtin_hlsl_wave_is_first_lane: { Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveIsFirstLaneIntrinsic(); diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 05e6e7800112df..b0acbbbbb2b1f0 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1992,6 +1992,11 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return true; break; } + case Builtin::BI__builtin_hlsl_wave_get_lane_index: { + if (SemaRef.checkArgCount(TheCall, 0)) + return true; + break; + } case Builtin::BI__builtin_elementwise_acos: case Builtin::BI__builtin_elementwise_asin: case Builtin::BI__builtin_elementwise_atan: diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl index 8f52d81091c180..06a2715b00e969 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl @@ 
-1,14 +1,22 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-pc-vulkan-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: spirv-pc-vulkan-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,CHECK-SPIRV +// RUN: %clang_cc1 -finclude-default-header \ +// RUN: -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,CHECK-DXIL -// CHECK: define spir_func noundef i32 @_Z6test_1v() [[A0:#[0-9]+]] { -// CHECK: %[[CI:[0-9]+]] = call token @llvm.experimental.convergence.entry() -// CHECK: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ] -uint test_1() { +// CHECK-SPIRV: define spir_func noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] { +// CHECK-DXIL: define noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] { +// CHECK-SPIRV: %[[CI:[0-9]+]] = call token @llvm.experimental.convergence.entry() +// CHECK-SPIRV: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ] +// CHECK-DXIL: call i32 @llvm.dx.wave.getlaneindex() +int test_1() { return WaveGetLaneIndex(); } -// CHECK: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] +// CHECK-SPIRV: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] +// CHECK-DXIL: declare i32 @llvm.dx.wave.getlaneindex() [[A1:#[0-9]+]] // CHECK-DAG: attributes [[A0]] = { {{.*}}convergent{{.*}} } // CHECK-DAG: attributes [[A1]] = { {{.*}}convergent{{.*}} } diff --git a/clang/test/SemaHLSL/BuiltIns/WaveGetLaneIndex-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/WaveGetLaneIndex-errors.hlsl new file mode 100644 index 00000000000000..6208442fab6590 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/WaveGetLaneIndex-errors.hlsl @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify + +int test_too_many_arg(int x) { + return 
__builtin_hlsl_wave_get_lane_index(x); + // expected-error@-1 {{too many arguments to function call, expected 0, have 1}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index f2b9e286ebb476..1cf6acbf126475 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -83,6 +83,7 @@ def int_dx_imad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLV def int_dx_umad : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_dx_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_dx_rsqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; +def int_dx_wave_getlaneindex : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrConvergent, IntrNoMem]>; def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; def int_dx_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>; def int_dx_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 9aa0af3e3a6b17..e8f56b18730d71 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -801,3 +801,12 @@ def WaveIsFirstLane : DXILOp<110, waveIsFirstLane> { let stages = [Stages]; let attributes = [Attributes]; } + +def WaveGetLaneIndex : DXILOp<111, waveGetLaneIndex> { + let Doc = "returns the index of the current lane in the wave"; + let LLVMIntrinsic = int_dx_wave_getlaneindex; + let arguments = []; + let result = Int32Ty; + let stages = [Stages]; + let attributes = [Attributes]; +} diff --git a/llvm/test/CodeGen/DirectX/WaveGetLaneIndex.ll b/llvm/test/CodeGen/DirectX/WaveGetLaneIndex.ll new file mode 100644 index 00000000000000..86b7ea4f962f77 --- /dev/null +++ 
b/llvm/test/CodeGen/DirectX/WaveGetLaneIndex.ll @@ -0,0 +1,10 @@ +; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s + +define void @main() { +entry: +; CHECK: call i32 @dx.op.waveGetLaneIndex(i32 111) + %0 = call i32 @llvm.dx.wave.getlaneindex() + ret void +} + +declare i32 @llvm.dx.wave.getlaneindex() From b800ff67dae59e194c8e9fc5d795a5932dc726f8 Mon Sep 17 00:00:00 2001 From: Donough Liu Date: Fri, 11 Oct 2024 02:46:19 +0800 Subject: [PATCH 061/177] [lldb][debugserver][NFC] Simplify macOS thread name fetching. (#111684) Remove unnecessary `proc_pidinfo` calling. --- .../debugserver/source/MacOSX/MachThread.cpp | 48 ++++++++++--------- .../debugserver/source/MacOSX/MachThread.h | 8 ++-- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/lldb/tools/debugserver/source/MacOSX/MachThread.cpp b/lldb/tools/debugserver/source/MacOSX/MachThread.cpp index d34914be802041..de2bebfcec7090 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachThread.cpp +++ b/lldb/tools/debugserver/source/MacOSX/MachThread.cpp @@ -31,9 +31,8 @@ MachThread::MachThread(MachProcess *process, bool is_64_bit, m_state(eStateUnloaded), m_state_mutex(PTHREAD_MUTEX_RECURSIVE), m_suspend_count(0), m_stop_exception(), m_arch_up(DNBArchProtocol::Create(this)), m_reg_sets(NULL), - m_num_reg_sets(0), m_ident_info(), m_proc_threadinfo(), - m_dispatch_queue_name(), m_is_64_bit(is_64_bit), - m_pthread_qos_class_decode(nullptr) { + m_num_reg_sets(0), m_extended_info(), m_dispatch_queue_name(), + m_is_64_bit(is_64_bit), m_pthread_qos_class_decode(nullptr) { nub_size_t num_reg_sets = 0; m_reg_sets = m_arch_up->GetRegisterSetInfo(&num_reg_sets); m_num_reg_sets = num_reg_sets; @@ -255,7 +254,7 @@ struct thread_basic_info *MachThread::GetBasicInfo() { bool MachThread::GetBasicInfo(thread_t thread, struct thread_basic_info *basicInfoPtr) { if (MachPortNumberIsValid(thread)) { - unsigned int info_count = THREAD_BASIC_INFO_COUNT; + mach_msg_type_number_t info_count = 
THREAD_BASIC_INFO_COUNT; kern_return_t err = ::thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)basicInfoPtr, &info_count); if (err == KERN_SUCCESS) @@ -265,6 +264,26 @@ bool MachThread::GetBasicInfo(thread_t thread, return false; } +struct thread_extended_info *MachThread::GetExtendedInfo() { + if (MachThread::GetExtendedInfo(m_mach_port_number, &m_extended_info)) + return &m_extended_info; + return NULL; +} + +bool MachThread::GetExtendedInfo(thread_t thread, + struct thread_extended_info *extendedInfoPtr) { + if (MachPortNumberIsValid(thread)) { + mach_msg_type_number_t info_count = THREAD_EXTENDED_INFO_COUNT; + kern_return_t err = + ::thread_info(thread, THREAD_EXTENDED_INFO, + (thread_info_t)extendedInfoPtr, &info_count); + if (err == KERN_SUCCESS) + return true; + } + ::memset(extendedInfoPtr, 0, sizeof(struct thread_extended_info)); + return false; +} + bool MachThread::ThreadIDIsValid(uint64_t thread) { return thread != 0; } bool MachThread::MachPortNumberIsValid(thread_t thread) { @@ -579,28 +598,13 @@ uint32_t MachThread::NumSupportedHardwareWatchpoints() const { return m_arch_up->NumSupportedHardwareWatchpoints(); } -bool MachThread::GetIdentifierInfo() { +const char *MachThread::GetName() { // Don't try to get the thread info once and cache it for the life of the // thread. It changes over time, for instance // if the thread name changes, then the thread_handle also changes... So you // have to refetch it every time. 
- mach_msg_type_number_t count = THREAD_IDENTIFIER_INFO_COUNT; - kern_return_t kret = ::thread_info(m_mach_port_number, THREAD_IDENTIFIER_INFO, - (thread_info_t)&m_ident_info, &count); - return kret == KERN_SUCCESS; - - return false; -} - -const char *MachThread::GetName() { - if (GetIdentifierInfo()) { - int len = ::proc_pidinfo(m_process->ProcessID(), PROC_PIDTHREADINFO, - m_ident_info.thread_handle, &m_proc_threadinfo, - sizeof(m_proc_threadinfo)); - - if (len && m_proc_threadinfo.pth_name[0]) - return m_proc_threadinfo.pth_name; - } + if (GetExtendedInfo() && m_extended_info.pth_name[0]) + return m_extended_info.pth_name; return NULL; } diff --git a/lldb/tools/debugserver/source/MacOSX/MachThread.h b/lldb/tools/debugserver/source/MacOSX/MachThread.h index 5466c6f9f95095..0c78ef1a337ed3 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachThread.h +++ b/lldb/tools/debugserver/source/MacOSX/MachThread.h @@ -108,6 +108,7 @@ class MachThread { bool IsUserReady(); struct thread_basic_info *GetBasicInfo(); + struct thread_extended_info *GetExtendedInfo(); const char *GetBasicInfoAsString() const; const char *GetName(); @@ -126,8 +127,8 @@ class MachThread { protected: static bool GetBasicInfo(thread_t threadID, struct thread_basic_info *basic_info); - - bool GetIdentifierInfo(); + static bool GetExtendedInfo(thread_t threadID, + struct thread_extended_info *extended_info); // const char * // GetDispatchQueueName(); @@ -152,8 +153,7 @@ class MachThread { const DNBRegisterSetInfo *m_reg_sets; // Register set information for this thread nub_size_t m_num_reg_sets; - thread_identifier_info_data_t m_ident_info; - struct proc_threadinfo m_proc_threadinfo; + thread_extended_info_data_t m_extended_info; std::string m_dispatch_queue_name; bool m_is_64_bit; From c2063de1593610eda0f4de33c3b89324642ed54c Mon Sep 17 00:00:00 2001 From: Greg Roth Date: Thu, 10 Oct 2024 12:58:28 -0600 Subject: [PATCH 062/177] Switch DirectX Target to use the Itanium ABI (#111632) To consolidate 
behavior of function mangling and limit the number of places that ABI changes will need to be made, this switches the DirectX target used for HLSL to use the Itanium ABI from the Microsoft ABI. The Itanium ABI has greater flexibility in decisions regarding mangling of new types of which we have more than a few yet to add. One effect of this will be that linking library shaders compiled with DXC will not be possible with shaders compiled with clang. That isn't considered a terribly interesting use case and one that would likely have been onerous to maintain anyway. This involved adding a function to call all global destructors as the Microsoft ABI had done. This requires a few changes to tests. Most notably the mangling style has changed which accounts for most of the changes. In making those changes, I took the opportunity to harmonize some very similar tests for greater consistency. I also shaved off some unneeded run flags that had probably been copied over from one test to another. Other changes effected by using the new ABI include using different types when manipulating smaller bitfields, eliminating an unnecessary alloca in one instance in this-assignment.hlsl, changing the way static local initialization is guarded, and changing the order of inout parameters getting copied in and out. That last is a subtle change in functionality, but one where there was sufficient inconsistency in the past that standardizing is important, but the particular direction of the standardization is less important for the sake of existing shaders. 
fixes #110736 --- clang/lib/Basic/Targets/DirectX.h | 2 +- clang/lib/CodeGen/ItaniumCXXABI.cpp | 4 + clang/test/CodeGenHLSL/ArrayTemporary.hlsl | 8 +- .../BasicFeatures/OutputArguments.hlsl | 26 ++-- .../GlobalConstructorFunction.hlsl | 8 +- .../CodeGenHLSL/GlobalConstructorLib.hlsl | 12 +- .../test/CodeGenHLSL/GlobalConstructors.hlsl | 2 +- clang/test/CodeGenHLSL/GlobalDestructors.hlsl | 14 +- clang/test/CodeGenHLSL/basic_types.hlsl | 64 ++++----- .../builtins/RWBuffer-annotations.hlsl | 12 +- .../builtins/RWBuffer-elementtype.hlsl | 26 ++-- .../RasterizerOrderedBuffer-annotations.hlsl | 12 +- .../StructuredBuffer-annotations.hlsl | 12 +- .../StructuredBuffer-elementtype.hlsl | 26 ++-- clang/test/CodeGenHLSL/builtins/abs.hlsl | 73 ++++++----- clang/test/CodeGenHLSL/builtins/ceil.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/clamp.hlsl | 101 +++++++------- clang/test/CodeGenHLSL/builtins/cos.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/exp.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/exp2.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/floor.hlsl | 37 +++--- .../CodeGenHLSL/builtins/hlsl_resource_t.hlsl | 4 +- clang/test/CodeGenHLSL/builtins/log.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/log10.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/log2.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/max.hlsl | 101 +++++++------- clang/test/CodeGenHLSL/builtins/min.hlsl | 101 +++++++------- clang/test/CodeGenHLSL/builtins/pow.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/round.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/saturate.hlsl | 123 +++++++----------- clang/test/CodeGenHLSL/builtins/sin.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/sqrt.hlsl | 37 +++--- clang/test/CodeGenHLSL/builtins/trunc.hlsl | 39 +++--- clang/test/CodeGenHLSL/export.hlsl | 11 +- clang/test/CodeGenHLSL/float3.hlsl | 2 +- clang/test/CodeGenHLSL/group_shared.hlsl | 2 +- clang/test/CodeGenHLSL/half.hlsl | 4 +- .../implicit-norecurse-attrib.hlsl | 8 +- 
.../test/CodeGenHLSL/inline-constructors.hlsl | 4 +- clang/test/CodeGenHLSL/inline-functions.hlsl | 10 +- .../semantics/GroupIndex-codegen.hlsl | 2 +- clang/test/CodeGenHLSL/shift-mask.hlsl | 43 +++++- clang/test/CodeGenHLSL/sret_output.hlsl | 7 +- clang/test/CodeGenHLSL/static-local-ctor.hlsl | 14 +- .../static_global_and_function_in_cb.hlsl | 7 +- .../CodeGenHLSL/this-assignment-overload.hlsl | 8 +- clang/test/CodeGenHLSL/this-assignment.hlsl | 7 +- clang/test/CodeGenHLSL/this-reference.hlsl | 4 +- 48 files changed, 667 insertions(+), 680 deletions(-) diff --git a/clang/lib/Basic/Targets/DirectX.h b/clang/lib/Basic/Targets/DirectX.h index cf7ea5e83503dc..19b61252409b09 100644 --- a/clang/lib/Basic/Targets/DirectX.h +++ b/clang/lib/Basic/Targets/DirectX.h @@ -62,7 +62,7 @@ class LLVM_LIBRARY_VISIBILITY DirectXTargetInfo : public TargetInfo { PlatformName = llvm::Triple::getOSTypeName(Triple.getOS()); resetDataLayout("e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:" "32-f64:64-n8:16:32:64"); - TheCXXABI.set(TargetCXXABI::Microsoft); + TheCXXABI.set(TargetCXXABI::GenericItanium); } bool useFP16ConversionIntrinsics() const override { return false; } void getTargetDefines(const LangOptions &Opts, diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index 965e09a7a760ec..75dab596e1b2c4 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -2997,6 +2997,10 @@ void ItaniumCXXABI::registerGlobalDtor(CodeGenFunction &CGF, const VarDecl &D, if (D.isNoDestroy(CGM.getContext())) return; + // HLSL doesn't support atexit. + if (CGM.getLangOpts().HLSL) + return CGM.AddCXXDtorEntry(dtor, addr); + // OpenMP offloading supports C++ constructors and destructors but we do not // always have 'atexit' available. Instead lower these to use the LLVM global // destructors which we can handle directly in the runtime. 
Note that this is diff --git a/clang/test/CodeGenHLSL/ArrayTemporary.hlsl b/clang/test/CodeGenHLSL/ArrayTemporary.hlsl index 63a30b61440eb5..7d77c0aff736cc 100644 --- a/clang/test/CodeGenHLSL/ArrayTemporary.hlsl +++ b/clang/test/CodeGenHLSL/ArrayTemporary.hlsl @@ -68,11 +68,11 @@ void call4(float Arr[2][2]) { // CHECK: [[Tmp2:%.*]] = alloca [4 x float] // CHECK: [[Tmp3:%.*]] = alloca [3 x i32] // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp1]], ptr align 4 [[FA2]], i32 8, i1 false) -// CHECK: call void @"??$template_fn@$$BY01M@@YAXY01M@Z"(ptr noundef byval([2 x float]) align 4 [[Tmp1]]) +// CHECK: call void @_Z11template_fnIA2_fEvT_(ptr noundef byval([2 x float]) align 4 [[Tmp1]]) // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp2]], ptr align 4 [[FA4]], i32 16, i1 false) -// CHECK: call void @"??$template_fn@$$BY03M@@YAXY03M@Z"(ptr noundef byval([4 x float]) align 4 [[Tmp2]]) +// CHECK: call void @_Z11template_fnIA4_fEvT_(ptr noundef byval([4 x float]) align 4 [[Tmp2]]) // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp3]], ptr align 4 [[IA3]], i32 12, i1 false) -// CHECK: call void @"??$template_fn@$$BY02H@@YAXY02H@Z"(ptr noundef byval([3 x i32]) align 4 [[Tmp3]]) +// CHECK: call void @_Z11template_fnIA3_iEvT_(ptr noundef byval([3 x i32]) align 4 [[Tmp3]]) template void template_fn(T Val) {} @@ -90,7 +90,7 @@ void template_call(float FA2[2], float FA4[4], int IA3[3]) { // CHECK: [[Addr:%.*]] = getelementptr inbounds [2 x float], ptr [[FA2]], i32 0, i32 0 // CHECK: [[Tmp:%.*]] = load float, ptr [[Addr]] -// CHECK: call void @"??$template_fn@M@@YAXM@Z"(float noundef [[Tmp]]) +// CHECK: call void @_Z11template_fnIfEvT_(float noundef [[Tmp]]) // CHECK: [[Idx0:%.*]] = getelementptr inbounds [2 x float], ptr [[FA2]], i32 0, i32 0 // CHECK: [[Val0:%.*]] = load float, ptr [[Idx0]] diff --git a/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl index 
58237889db1dca..6afead4f233660 100644 --- a/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl +++ b/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl @@ -260,10 +260,10 @@ void order_matters(inout int X, inout int Y) { // CHECK: store i32 [[VVal]], ptr [[Tmp0]] // CHECK: [[VVal:%.*]] = load i32, ptr [[V]] // CHECK: store i32 [[VVal]], ptr [[Tmp1]] -// CHECK: call void {{.*}}order_matters{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) [[Tmp1]], ptr noalias noundef nonnull align 4 dereferenceable(4) [[Tmp0]]) -// CHECK: [[Arg1Val:%.*]] = load i32, ptr [[Tmp1]] +// CHECK: call void {{.*}}order_matters{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) [[Tmp0]], ptr noalias noundef nonnull align 4 dereferenceable(4) [[Tmp1]]) +// CHECK: [[Arg1Val:%.*]] = load i32, ptr [[Tmp0]] // CHECK: store i32 [[Arg1Val]], ptr [[V]] -// CHECK: [[Arg2Val:%.*]] = load i32, ptr [[Tmp0]] +// CHECK: [[Arg2Val:%.*]] = load i32, ptr [[Tmp1]] // CHECK: store i32 [[Arg2Val]], ptr [[V]] // OPT: ret i32 2 @@ -289,17 +289,19 @@ void setFour(inout int I) { // CHECK: [[B:%.*]] = alloca %struct.B // CHECK: [[Tmp:%.*]] = alloca i32 -// CHECK: [[BFLoad:%.*]] = load i32, ptr [[B]] -// CHECK: [[BFshl:%.*]] = shl i32 [[BFLoad]], 24 -// CHECK: [[BFashr:%.*]] = ashr i32 [[BFshl]], 24 -// CHECK: store i32 [[BFashr]], ptr [[Tmp]] +// CHECK: [[BFLoad:%.*]] = load i16, ptr [[B]] +// CHECK: [[BFshl:%.*]] = shl i16 [[BFLoad]], 8 +// CHECK: [[BFashr:%.*]] = ashr i16 [[BFshl]], 8 +// CHECK: [[BFcast:%.*]] = sext i16 [[BFashr]] to i32 +// CHECK: store i32 [[BFcast]], ptr [[Tmp]] // CHECK: call void {{.*}}setFour{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) [[Tmp]]) // CHECK: [[RetVal:%.*]] = load i32, ptr [[Tmp]] -// CHECK: [[BFLoad:%.*]] = load i32, ptr [[B]] -// CHECK: [[BFValue:%.*]] = and i32 [[RetVal]], 255 -// CHECK: [[ZerodField:%.*]] = and i32 [[BFLoad]], -256 -// CHECK: [[BFSet:%.*]] = or i32 [[ZerodField]], [[BFValue]] -// CHECK: store i32 
[[BFSet]], ptr [[B]] +// CHECK: [[TruncVal:%.*]] = trunc i32 [[RetVal]] to i16 +// CHECK: [[BFLoad:%.*]] = load i16, ptr [[B]] +// CHECK: [[BFValue:%.*]] = and i16 [[TruncVal]], 255 +// CHECK: [[ZerodField:%.*]] = and i16 [[BFLoad]], -256 +// CHECK: [[BFSet:%.*]] = or i16 [[ZerodField]], [[BFValue]] +// CHECK: store i16 [[BFSet]], ptr [[B]] // OPT: ret i32 8 export int case11() { diff --git a/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl index b39311ad67cd62..c0eb1b138ed047 100644 --- a/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl +++ b/clang/test/CodeGenHLSL/GlobalConstructorFunction.hlsl @@ -25,11 +25,11 @@ void main(unsigned GI : SV_GroupIndex) {} // CHECK: define void @main() // CHECK-NEXT: entry: // Verify function constructors are emitted -// NOINLINE-NEXT: call void @"?call_me_first@@YAXXZ"() -// NOINLINE-NEXT: call void @"?then_call_me@@YAXXZ"() +// NOINLINE-NEXT: call void @_Z13call_me_firstv() +// NOINLINE-NEXT: call void @_Z12then_call_mev() // NOINLINE-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() -// NOINLINE-NEXT: call void @"?main@@YAXI@Z"(i32 %0) -// NOINLINE-NEXT: call void @"?call_me_last@@YAXXZ"( +// NOINLINE-NEXT: call void @_Z4mainj(i32 %0) +// NOINLINE-NEXT: call void @_Z12call_me_lastv( // NOINLINE-NEXT: ret void // Verify constructor calls are inlined when AlwaysInline is run diff --git a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl index 78f6475462bc47..09c44f6242c53c 100644 --- a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl +++ b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl @@ -13,7 +13,7 @@ void FirstEntry() {} // CHECK: define void @FirstEntry() // CHECK-NEXT: entry: // NOINLINE-NEXT: call void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() -// NOINLINE-NEXT: call void @"?FirstEntry@@YAXXZ"() +// NOINLINE-NEXT: call void @_Z10FirstEntryv() // Verify inlining leaves only calls to "llvm." 
intrinsics // INLINE-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} // CHECK: ret void @@ -25,7 +25,7 @@ void SecondEntry() {} // CHECK: define void @SecondEntry() // CHECK-NEXT: entry: // NOINLINE-NEXT: call void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() -// NOINLINE-NEXT: call void @"?SecondEntry@@YAXXZ"() +// NOINLINE-NEXT: call void @_Z11SecondEntryv() // Verify inlining leaves only calls to "llvm." intrinsics // INLINE-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} // CHECK: ret void @@ -33,6 +33,10 @@ void SecondEntry() {} // Verify the constructor is alwaysinline // NOINLINE: ; Function Attrs: {{.*}}alwaysinline -// NOINLINE-NEXT: define internal void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() [[IntAttr:\#[0-9]+]] +// NOINLINE-NEXT: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2Ev({{.*}} [[CtorAttr:\#[0-9]+]] -// NOINLINE: attributes [[IntAttr]] = {{.*}} alwaysinline +// NOINLINE: ; Function Attrs: {{.*}}alwaysinline +// NOINLINE-NEXT: define internal void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() [[InitAttr:\#[0-9]+]] + +// NOINLINE-DAG: attributes [[InitAttr]] = {{.*}} alwaysinline +// NOINLINE-DAG: attributes [[CtorAttr]] = {{.*}} alwaysinline diff --git a/clang/test/CodeGenHLSL/GlobalConstructors.hlsl b/clang/test/CodeGenHLSL/GlobalConstructors.hlsl index 7e2f288726c954..7b26dba0d19010 100644 --- a/clang/test/CodeGenHLSL/GlobalConstructors.hlsl +++ b/clang/test/CodeGenHLSL/GlobalConstructors.hlsl @@ -12,5 +12,5 @@ void main(unsigned GI : SV_GroupIndex) {} //CHECK-NEXT: entry: //CHECK-NEXT: call void @_GLOBAL__sub_I_GlobalConstructors.hlsl() //CHECK-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() -//CHECK-NEXT: call void @"?main@@YAXI@Z"(i32 %0) +//CHECK-NEXT: call void @_Z4mainj(i32 %0) //CHECK-NEXT: ret void diff --git a/clang/test/CodeGenHLSL/GlobalDestructors.hlsl b/clang/test/CodeGenHLSL/GlobalDestructors.hlsl index ea28354222f885..f98318601134bb 100644 --- a/clang/test/CodeGenHLSL/GlobalDestructors.hlsl +++ 
b/clang/test/CodeGenHLSL/GlobalDestructors.hlsl @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CS,NOINLINE,CHECK -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -std=hlsl202x -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=LIB,NOINLINE,CHECK -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=INLINE,CHECK -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -std=hlsl202x -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=INLINE,CHECK +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CS,NOINLINE,CHECK +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=LIB,NOINLINE,CHECK +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=INLINE,CHECK +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=INLINE,CHECK // Tests that constructors and destructors are appropriately generated for globals // and that their calls are inlined when AlwaysInline is run @@ -59,7 +59,7 @@ void main(unsigned GI : SV_GroupIndex) { // Verify destructor is emitted // NOINLINE-NEXT: call void @_GLOBAL__sub_I_GlobalDestructors.hlsl() // NOINLINE-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() -// NOINLINE-NEXT: call void @"?main@@YAXI@Z"(i32 %0) +// NOINLINE-NEXT: call void @_Z4mainj(i32 %0) // NOINLINE-NEXT: call void @_GLOBAL__D_a() // NOINLINE-NEXT: ret void // Verify inlining leaves only calls to "llvm." 
intrinsics @@ -71,8 +71,8 @@ void main(unsigned GI : SV_GroupIndex) { // NOINLINE: define internal void @_GLOBAL__D_a() [[IntAttr:\#[0-9]+]] // NOINLINE-NEXT: entry: -// NOINLINE-NEXT: call void @"??1Tail@@QAA@XZ"(ptr @"?T@?1??Wag@@YAXXZ@4UTail@@A") -// NOINLINE-NEXT: call void @"??1Pupper@@QAA@XZ"(ptr @"?GlobalPup@@3UPupper@@A") +// NOINLINE-NEXT: call void @_ZN4TailD1Ev(ptr @_ZZ3WagvE1T) +// NOINLINE-NEXT: call void @_ZN6PupperD1Ev(ptr @GlobalPup) // NOINLINE-NEXT: ret void // NOINLINE: attributes [[IntAttr]] = {{.*}} alwaysinline diff --git a/clang/test/CodeGenHLSL/basic_types.hlsl b/clang/test/CodeGenHLSL/basic_types.hlsl index 15c963dfa666f4..d987af45a649fb 100644 --- a/clang/test/CodeGenHLSL/basic_types.hlsl +++ b/clang/test/CodeGenHLSL/basic_types.hlsl @@ -6,38 +6,38 @@ // RUN: -emit-llvm -disable-llvm-passes -o - -DNAMESPACED| FileCheck %s -// CHECK:"?uint16_t_Val@@3GA" = global i16 0, align 2 -// CHECK:"?int16_t_Val@@3FA" = global i16 0, align 2 -// CHECK:"?uint_Val@@3IA" = global i32 0, align 4 -// CHECK:"?uint64_t_Val@@3KA" = global i64 0, align 8 -// CHECK:"?int64_t_Val@@3JA" = global i64 0, align 8 -// CHECK:"?int16_t2_Val@@3T?$__vector@F$01@__clang@@A" = global <2 x i16> zeroinitializer, align 4 -// CHECK:"?int16_t3_Val@@3T?$__vector@F$02@__clang@@A" = global <3 x i16> zeroinitializer, align 8 -// CHECK:"?int16_t4_Val@@3T?$__vector@F$03@__clang@@A" = global <4 x i16> zeroinitializer, align 8 -// CHECK:"?uint16_t2_Val@@3T?$__vector@G$01@__clang@@A" = global <2 x i16> zeroinitializer, align 4 -// CHECK:"?uint16_t3_Val@@3T?$__vector@G$02@__clang@@A" = global <3 x i16> zeroinitializer, align 8 -// CHECK:"?uint16_t4_Val@@3T?$__vector@G$03@__clang@@A" = global <4 x i16> zeroinitializer, align 8 -// CHECK:"?int2_Val@@3T?$__vector@H$01@__clang@@A" = global <2 x i32> zeroinitializer, align 8 -// CHECK:"?int3_Val@@3T?$__vector@H$02@__clang@@A" = global <3 x i32> zeroinitializer, align 16 -// CHECK:"?int4_Val@@3T?$__vector@H$03@__clang@@A" = global <4 x i32> 
zeroinitializer, align 16 -// CHECK:"?uint2_Val@@3T?$__vector@I$01@__clang@@A" = global <2 x i32> zeroinitializer, align 8 -// CHECK:"?uint3_Val@@3T?$__vector@I$02@__clang@@A" = global <3 x i32> zeroinitializer, align 16 -// CHECK:"?uint4_Val@@3T?$__vector@I$03@__clang@@A" = global <4 x i32> zeroinitializer, align 16 -// CHECK:"?int64_t2_Val@@3T?$__vector@J$01@__clang@@A" = global <2 x i64> zeroinitializer, align 16 -// CHECK:"?int64_t3_Val@@3T?$__vector@J$02@__clang@@A" = global <3 x i64> zeroinitializer, align 32 -// CHECK:"?int64_t4_Val@@3T?$__vector@J$03@__clang@@A" = global <4 x i64> zeroinitializer, align 32 -// CHECK:"?uint64_t2_Val@@3T?$__vector@K$01@__clang@@A" = global <2 x i64> zeroinitializer, align 16 -// CHECK:"?uint64_t3_Val@@3T?$__vector@K$02@__clang@@A" = global <3 x i64> zeroinitializer, align 32 -// CHECK:"?uint64_t4_Val@@3T?$__vector@K$03@__clang@@A" = global <4 x i64> zeroinitializer, align 32 -// CHECK:"?half2_Val@@3T?$__vector@$f16@$01@__clang@@A" = global <2 x half> zeroinitializer, align 4 -// CHECK:"?half3_Val@@3T?$__vector@$f16@$02@__clang@@A" = global <3 x half> zeroinitializer, align 8 -// CHECK:"?half4_Val@@3T?$__vector@$f16@$03@__clang@@A" = global <4 x half> zeroinitializer, align 8 -// CHECK:"?float2_Val@@3T?$__vector@M$01@__clang@@A" = global <2 x float> zeroinitializer, align 8 -// CHECK:"?float3_Val@@3T?$__vector@M$02@__clang@@A" = global <3 x float> zeroinitializer, align 16 -// CHECK:"?float4_Val@@3T?$__vector@M$03@__clang@@A" = global <4 x float> zeroinitializer, align 16 -// CHECK:"?double2_Val@@3T?$__vector@N$01@__clang@@A" = global <2 x double> zeroinitializer, align 16 -// CHECK:"?double3_Val@@3T?$__vector@N$02@__clang@@A" = global <3 x double> zeroinitializer, align 32 -// CHECK:"?double4_Val@@3T?$__vector@N$03@__clang@@A" = global <4 x double> zeroinitializer, align 32 +// CHECK: @uint16_t_Val = global i16 0, align 2 +// CHECK: @int16_t_Val = global i16 0, align 2 +// CHECK: @uint_Val = global i32 0, align 4 +// CHECK: 
@uint64_t_Val = global i64 0, align 8 +// CHECK: @int64_t_Val = global i64 0, align 8 +// CHECK: @int16_t2_Val = global <2 x i16> zeroinitializer, align 4 +// CHECK: @int16_t3_Val = global <3 x i16> zeroinitializer, align 8 +// CHECK: @int16_t4_Val = global <4 x i16> zeroinitializer, align 8 +// CHECK: @uint16_t2_Val = global <2 x i16> zeroinitializer, align 4 +// CHECK: @uint16_t3_Val = global <3 x i16> zeroinitializer, align 8 +// CHECK: @uint16_t4_Val = global <4 x i16> zeroinitializer, align 8 +// CHECK: @int2_Val = global <2 x i32> zeroinitializer, align 8 +// CHECK: @int3_Val = global <3 x i32> zeroinitializer, align 16 +// CHECK: @int4_Val = global <4 x i32> zeroinitializer, align 16 +// CHECK: @uint2_Val = global <2 x i32> zeroinitializer, align 8 +// CHECK: @uint3_Val = global <3 x i32> zeroinitializer, align 16 +// CHECK: @uint4_Val = global <4 x i32> zeroinitializer, align 16 +// CHECK: @int64_t2_Val = global <2 x i64> zeroinitializer, align 16 +// CHECK: @int64_t3_Val = global <3 x i64> zeroinitializer, align 32 +// CHECK: @int64_t4_Val = global <4 x i64> zeroinitializer, align 32 +// CHECK: @uint64_t2_Val = global <2 x i64> zeroinitializer, align 16 +// CHECK: @uint64_t3_Val = global <3 x i64> zeroinitializer, align 32 +// CHECK: @uint64_t4_Val = global <4 x i64> zeroinitializer, align 32 +// CHECK: @half2_Val = global <2 x half> zeroinitializer, align 4 +// CHECK: @half3_Val = global <3 x half> zeroinitializer, align 8 +// CHECK: @half4_Val = global <4 x half> zeroinitializer, align 8 +// CHECK: @float2_Val = global <2 x float> zeroinitializer, align 8 +// CHECK: @float3_Val = global <3 x float> zeroinitializer, align 16 +// CHECK: @float4_Val = global <4 x float> zeroinitializer, align 16 +// CHECK: @double2_Val = global <2 x double> zeroinitializer, align 16 +// CHECK: @double3_Val = global <3 x double> zeroinitializer, align 32 +// CHECK: @double4_Val = global <4 x double> zeroinitializer, align 32 #ifdef NAMESPACED #define TYPE_DECL(T) hlsl::T 
T##_Val diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-annotations.hlsl index 7ca78e60fb9c59..e1e047485e4df0 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-annotations.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-annotations.hlsl @@ -16,9 +16,9 @@ void main() { } // CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]} -// CHECK-DAG: ![[Single]] = !{ptr @"?Buffer1@@3V?$RWBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[Array]] = !{ptr @"?BufferArray@@3PAV?$RWBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[SingleAllocated]] = !{ptr @"?Buffer2@@3V?$RWBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 0} -// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @"?BufferArray2@@3PAV?$RWBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 0} -// CHECK-DAG: ![[SingleSpace]] = !{ptr @"?Buffer3@@3V?$RWBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 1} -// CHECK-DAG: ![[ArraySpace]] = !{ptr @"?BufferArray3@@3PAV?$RWBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 1} +// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 false, i32 -1, i32 0} +// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 false, i32 -1, i32 0} +// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 false, i32 3, i32 0} +// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 false, i32 4, i32 0} +// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 false, i32 3, i32 1} +// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 false, i32 4, i32 1} diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl index 
036c9c28ef2779..eca4f1598fd658 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl @@ -37,16 +37,16 @@ void main(int GI : SV_GroupIndex) { BufF32x3[GI] = 0; } -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16@@3V?$RWBuffer@F@hlsl@@A", i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU16@@3V?$RWBuffer@G@hlsl@@A", i32 10, i32 3, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI32@@3V?$RWBuffer@H@hlsl@@A", i32 10, i32 4, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32@@3V?$RWBuffer@I@hlsl@@A", i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI64@@3V?$RWBuffer@J@hlsl@@A", i32 10, i32 6, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU64@@3V?$RWBuffer@K@hlsl@@A", i32 10, i32 7, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16@@3V?$RWBuffer@$f16@@hlsl@@A", i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32@@3V?$RWBuffer@M@hlsl@@A", i32 10, i32 9, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF64@@3V?$RWBuffer@N@hlsl@@A", i32 10, i32 10, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16x4@@3V?$RWBuffer@T?$__vector@F$03@__clang@@@hlsl@@A", i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32x3@@3V?$RWBuffer@T?$__vector@I$02@__clang@@@hlsl@@A", i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16x2@@3V?$RWBuffer@T?$__vector@$f16@$01@__clang@@@hlsl@@A", i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32x3@@3V?$RWBuffer@T?$__vector@M$02@__clang@@@hlsl@@A", i32 10, i32 9, +// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2, +// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3, +// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4, +// CHECK: !{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5, +// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6, +// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7, +// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8, +// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9, +// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10, +// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2, +// CHECK: 
!{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5, +// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8, +// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9, diff --git a/clang/test/CodeGenHLSL/builtins/RasterizerOrderedBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/RasterizerOrderedBuffer-annotations.hlsl index bf70cc2456c8bc..5155f129025979 100644 --- a/clang/test/CodeGenHLSL/builtins/RasterizerOrderedBuffer-annotations.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RasterizerOrderedBuffer-annotations.hlsl @@ -12,9 +12,9 @@ RasterizerOrderedBuffer > BufferArray3[4] : register(u4, space1 void main() {} // CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]} -// CHECK-DAG: ![[Single]] = !{ptr @"?Buffer1@@3V?$RasterizerOrderedBuffer@M@hlsl@@A", i32 10, i32 9, i1 true, i32 -1, i32 0} -// CHECK-DAG: ![[Array]] = !{ptr @"?BufferArray@@3PAV?$RasterizerOrderedBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 true, i32 -1, i32 0} -// CHECK-DAG: ![[SingleAllocated]] = !{ptr @"?Buffer2@@3V?$RasterizerOrderedBuffer@M@hlsl@@A", i32 10, i32 9, i1 true, i32 3, i32 0} -// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @"?BufferArray2@@3PAV?$RasterizerOrderedBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 true, i32 4, i32 0} -// CHECK-DAG: ![[SingleSpace]] = !{ptr @"?Buffer3@@3V?$RasterizerOrderedBuffer@M@hlsl@@A", i32 10, i32 9, i1 true, i32 3, i32 1} -// CHECK-DAG: ![[ArraySpace]] = !{ptr @"?BufferArray3@@3PAV?$RasterizerOrderedBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 true, i32 4, i32 1} +// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 true, i32 -1, i32 0} +// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 true, i32 -1, i32 0} +// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 true, i32 3, i32 0} +// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, 
i1 true, i32 4, i32 0} +// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 true, i32 3, i32 1} +// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 true, i32 4, i32 1} diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl index 16b7295c985f77..4d3d4908c396e6 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl @@ -14,9 +14,9 @@ void main() { } // CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]} -// CHECK-DAG: ![[Single]] = !{ptr @"?Buffer1@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[Array]] = !{ptr @"?BufferArray@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0} -// CHECK-DAG: ![[SingleAllocated]] = !{ptr @"?Buffer2@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 0} -// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @"?BufferArray2@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 0} -// CHECK-DAG: ![[SingleSpace]] = !{ptr @"?Buffer3@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 1} -// CHECK-DAG: ![[ArraySpace]] = !{ptr @"?BufferArray3@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 1} +// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 false, i32 -1, i32 0} +// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 false, i32 -1, i32 0} +// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 false, i32 3, i32 0} +// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 false, i32 4, i32 0} +// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, 
i32 9, i1 false, i32 3, i32 1} +// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 false, i32 4, i32 1} diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl index 8ddf8a6004403e..326885efbeeaba 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl @@ -37,16 +37,16 @@ void main(int GI : SV_GroupIndex) { BufF32x3[GI] = 0; } -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16@@3V?$StructuredBuffer@F@hlsl@@A", i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU16@@3V?$StructuredBuffer@G@hlsl@@A", i32 10, i32 3, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI32@@3V?$StructuredBuffer@H@hlsl@@A", i32 10, i32 4, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32@@3V?$StructuredBuffer@I@hlsl@@A", i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI64@@3V?$StructuredBuffer@J@hlsl@@A", i32 10, i32 6, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU64@@3V?$StructuredBuffer@K@hlsl@@A", i32 10, i32 7, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16@@3V?$StructuredBuffer@$f16@@hlsl@@A", i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF64@@3V?$StructuredBuffer@N@hlsl@@A", i32 10, i32 10, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16x4@@3V?$StructuredBuffer@T?$__vector@F$03@__clang@@@hlsl@@A", i32 10, i32 2, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32x3@@3V?$StructuredBuffer@T?$__vector@I$02@__clang@@@hlsl@@A", i32 10, i32 5, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16x2@@3V?$StructuredBuffer@T?$__vector@$f16@$01@__clang@@@hlsl@@A", i32 10, i32 8, -// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32x3@@3V?$StructuredBuffer@T?$__vector@M$02@__clang@@@hlsl@@A", i32 10, i32 9, +// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2, +// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3, +// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4, +// CHECK: 
!{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5, +// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6, +// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7, +// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8, +// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9, +// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10, +// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2, +// CHECK: !{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5, +// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8, +// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9, diff --git a/clang/test/CodeGenHLSL/builtins/abs.hlsl b/clang/test/CodeGenHLSL/builtins/abs.hlsl index ad65cab2721a2b..912e8a28347237 100644 --- a/clang/test/CodeGenHLSL/builtins/abs.hlsl +++ b/clang/test/CodeGenHLSL/builtins/abs.hlsl @@ -1,93 +1,96 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF using hlsl::abs; #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z16test_abs_int16_t // NATIVE_HALF: call i16 @llvm.abs.i16( int16_t test_abs_int16_t(int16_t p0) { return abs(p0); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 
x i16> @_Z17test_abs_int16_t2 // NATIVE_HALF: call <2 x i16> @llvm.abs.v2i16( int16_t2 test_abs_int16_t2(int16_t2 p0) { return abs(p0); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z17test_abs_int16_t3 // NATIVE_HALF: call <3 x i16> @llvm.abs.v3i16( int16_t3 test_abs_int16_t3(int16_t3 p0) { return abs(p0); } -// NATIVE_HALF: define noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z17test_abs_int16_t4 // NATIVE_HALF: call <4 x i16> @llvm.abs.v4i16( int16_t4 test_abs_int16_t4(int16_t4 p0) { return abs(p0); } #endif // __HLSL_ENABLE_16_BIT -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_abs_half // NATIVE_HALF: call half @llvm.fabs.f16( -// NO_HALF: define noundef float @"?test_abs_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z13test_abs_half // NO_HALF: call float @llvm.fabs.f32(float %0) half test_abs_half(half p0) { return abs(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_abs_half2 // NATIVE_HALF: call <2 x half> @llvm.fabs.v2f16( -// NO_HALF: define noundef <2 x float> @"?test_abs_half2@@YAT?$__vector@$halff@$01@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_abs_half2 // NO_HALF: call <2 x float> @llvm.fabs.v2f32( half2 test_abs_half2(half2 p0) { return abs(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_abs_half3 // NATIVE_HALF: call <3 x half> @llvm.fabs.v3f16( -// NO_HALF: define noundef <3 x float> @"?test_abs_half3@@YAT?$__vector@$halff@$02@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_abs_half3 // NO_HALF: call <3 x float> @llvm.fabs.v3f32( half3 test_abs_half3(half3 p0) { return abs(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_abs_half4 // NATIVE_HALF: call <4 x half> 
@llvm.fabs.v4f16( -// NO_HALF: define noundef <4 x float> @"?test_abs_half4@@YAT?$__vector@$halff@$03@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_abs_half4 // NO_HALF: call <4 x float> @llvm.fabs.v4f32( half4 test_abs_half4(half4 p0) { return abs(p0); } -// CHECK: define noundef i32 @ + +// CHECK-LABEL: define noundef i32 @_Z12test_abs_int // CHECK: call i32 @llvm.abs.i32( int test_abs_int(int p0) { return abs(p0); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z13test_abs_int2 // CHECK: call <2 x i32> @llvm.abs.v2i32( int2 test_abs_int2(int2 p0) { return abs(p0); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z13test_abs_int3 // CHECK: call <3 x i32> @llvm.abs.v3i32( int3 test_abs_int3(int3 p0) { return abs(p0); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z13test_abs_int4 // CHECK: call <4 x i32> @llvm.abs.v4i32( int4 test_abs_int4(int4 p0) { return abs(p0); } -// CHECK: define noundef float @ + +// CHECK-LABEL: define noundef float @_Z14test_abs_float // CHECK: call float @llvm.fabs.f32( float test_abs_float(float p0) { return abs(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z15test_abs_float2 // CHECK: call <2 x float> @llvm.fabs.v2f32( float2 test_abs_float2(float2 p0) { return abs(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z15test_abs_float3 // CHECK: call <3 x float> @llvm.fabs.v3f32( float3 test_abs_float3(float3 p0) { return abs(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z15test_abs_float4 // CHECK: call <4 x float> @llvm.fabs.v4f32( float4 test_abs_float4(float4 p0) { return abs(p0); } -// CHECK: define noundef i64 @ + +// CHECK-LABEL: define noundef i64 @_Z16test_abs_int64_t // CHECK: call i64 @llvm.abs.i64( int64_t test_abs_int64_t(int64_t p0) { return abs(p0); } -// 
CHECK: define noundef <2 x i64> @ +// CHECK-LABEL: define noundef <2 x i64> @_Z17test_abs_int64_t2 // CHECK: call <2 x i64> @llvm.abs.v2i64( int64_t2 test_abs_int64_t2(int64_t2 p0) { return abs(p0); } -// CHECK: define noundef <3 x i64> @ +// CHECK-LABEL: define noundef <3 x i64> @_Z17test_abs_int64_t3 // CHECK: call <3 x i64> @llvm.abs.v3i64( int64_t3 test_abs_int64_t3(int64_t3 p0) { return abs(p0); } -// CHECK: define noundef <4 x i64> @ +// CHECK-LABEL: define noundef <4 x i64> @_Z17test_abs_int64_t4 // CHECK: call <4 x i64> @llvm.abs.v4i64( int64_t4 test_abs_int64_t4(int64_t4 p0) { return abs(p0); } -// CHECK: define noundef double @ + +// CHECK-LABEL: define noundef double @_Z15test_abs_double // CHECK: call double @llvm.fabs.f64( double test_abs_double(double p0) { return abs(p0); } -// CHECK: define noundef <2 x double> @ +// CHECK-LABEL: define noundef <2 x double> @_Z16test_abs_double2 // CHECK: call <2 x double> @llvm.fabs.v2f64( double2 test_abs_double2(double2 p0) { return abs(p0); } -// CHECK: define noundef <3 x double> @ +// CHECK-LABEL: define noundef <3 x double> @_Z16test_abs_double3 // CHECK: call <3 x double> @llvm.fabs.v3f64( double3 test_abs_double3(double3 p0) { return abs(p0); } -// CHECK: define noundef <4 x double> @ +// CHECK-LABEL: define noundef <4 x double> @_Z16test_abs_double4 // CHECK: call <4 x double> @llvm.fabs.v4f64( double4 test_abs_double4(double4 p0) { return abs(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/ceil.hlsl b/clang/test/CodeGenHLSL/builtins/ceil.hlsl index be7725cd4d66c1..3aa78ec0ebcca3 100644 --- a/clang/test/CodeGenHLSL/builtins/ceil.hlsl +++ b/clang/test/CodeGenHLSL/builtins/ceil.hlsl @@ -1,43 +1,42 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x 
hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF using hlsl::ceil; -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z14test_ceil_half // NATIVE_HALF: call half @llvm.ceil.f16( -// NO_HALF: define noundef float @"?test_ceil_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z14test_ceil_half // NO_HALF: call float @llvm.ceil.f32(float %0) half test_ceil_half(half p0) { return ceil(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z15test_ceil_half2 // NATIVE_HALF: call <2 x half> @llvm.ceil.v2f16( -// NO_HALF: define noundef <2 x float> @"?test_ceil_half2@@YAT?$__vector@$halff@$01@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <2 x float> @_Z15test_ceil_half2 // NO_HALF: call <2 x float> @llvm.ceil.v2f32( half2 test_ceil_half2(half2 p0) { return ceil(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z15test_ceil_half3 // NATIVE_HALF: call <3 x half> @llvm.ceil.v3f16( -// NO_HALF: define noundef <3 x float> @"?test_ceil_half3@@YAT?$__vector@$halff@$02@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <3 x float> @_Z15test_ceil_half3 // NO_HALF: call <3 x float> @llvm.ceil.v3f32( half3 test_ceil_half3(half3 p0) { return ceil(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z15test_ceil_half4 // NATIVE_HALF: call <4 x half> @llvm.ceil.v4f16( -// 
NO_HALF: define noundef <4 x float> @"?test_ceil_half4@@YAT?$__vector@$halff@$03@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <4 x float> @_Z15test_ceil_half4 // NO_HALF: call <4 x float> @llvm.ceil.v4f32( half4 test_ceil_half4(half4 p0) { return ceil(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z15test_ceil_float // CHECK: call float @llvm.ceil.f32( float test_ceil_float(float p0) { return ceil(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z16test_ceil_float2 // CHECK: call <2 x float> @llvm.ceil.v2f32( float2 test_ceil_float2(float2 p0) { return ceil(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z16test_ceil_float3 // CHECK: call <3 x float> @llvm.ceil.v3f32( float3 test_ceil_float3(float3 p0) { return ceil(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z16test_ceil_float4 // CHECK: call <4 x float> @llvm.ceil.v4f32( float4 test_ceil_float4(float4 p0) { return ceil(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/clamp.hlsl b/clang/test/CodeGenHLSL/builtins/clamp.hlsl index 186114581e9c18..af8f6b9733a071 100644 --- a/clang/test/CodeGenHLSL/builtins/clamp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/clamp.hlsl @@ -1,134 +1,133 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s 
--check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z16test_clamp_short // NATIVE_HALF: call i16 @llvm.dx.clamp.i16( int16_t test_clamp_short(int16_t p0, int16_t p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z17test_clamp_short2 // NATIVE_HALF: call <2 x i16> @llvm.dx.clamp.v2i16( int16_t2 test_clamp_short2(int16_t2 p0, int16_t2 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z17test_clamp_short3 // NATIVE_HALF: call <3 x i16> @llvm.dx.clamp.v3i16 int16_t3 test_clamp_short3(int16_t3 p0, int16_t3 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z17test_clamp_short4 // NATIVE_HALF: call <4 x i16> @llvm.dx.clamp.v4i16 int16_t4 test_clamp_short4(int16_t4 p0, int16_t4 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z17test_clamp_ushort // NATIVE_HALF: call i16 @llvm.dx.uclamp.i16( uint16_t test_clamp_ushort(uint16_t p0, uint16_t p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z18test_clamp_ushort2 // NATIVE_HALF: call <2 x i16> @llvm.dx.uclamp.v2i16 uint16_t2 test_clamp_ushort2(uint16_t2 p0, uint16_t2 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z18test_clamp_ushort3 // NATIVE_HALF: call <3 x i16> @llvm.dx.uclamp.v3i16 uint16_t3 test_clamp_ushort3(uint16_t3 p0, uint16_t3 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define 
noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z18test_clamp_ushort4 // NATIVE_HALF: call <4 x i16> @llvm.dx.uclamp.v4i16 uint16_t4 test_clamp_ushort4(uint16_t4 p0, uint16_t4 p1) { return clamp(p0, p1,p1); } #endif -// CHECK: define noundef i32 @ +// CHECK-LABEL: define noundef i32 @_Z14test_clamp_int // CHECK: call i32 @llvm.dx.clamp.i32( int test_clamp_int(int p0, int p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z15test_clamp_int2 // CHECK: call <2 x i32> @llvm.dx.clamp.v2i32 int2 test_clamp_int2(int2 p0, int2 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z15test_clamp_int3 // CHECK: call <3 x i32> @llvm.dx.clamp.v3i32 int3 test_clamp_int3(int3 p0, int3 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z15test_clamp_int4 // CHECK: call <4 x i32> @llvm.dx.clamp.v4i32 int4 test_clamp_int4(int4 p0, int4 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef i32 @ +// CHECK-LABEL: define noundef i32 @_Z15test_clamp_uint // CHECK: call i32 @llvm.dx.uclamp.i32( int test_clamp_uint(uint p0, uint p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z16test_clamp_uint2 // CHECK: call <2 x i32> @llvm.dx.uclamp.v2i32 uint2 test_clamp_uint2(uint2 p0, uint2 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z16test_clamp_uint3 // CHECK: call <3 x i32> @llvm.dx.uclamp.v3i32 uint3 test_clamp_uint3(uint3 p0, uint3 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z16test_clamp_uint4 // CHECK: call <4 x i32> @llvm.dx.uclamp.v4i32 uint4 test_clamp_uint4(uint4 p0, uint4 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef i64 @ +// CHECK-LABEL: define noundef 
i64 @_Z15test_clamp_long // CHECK: call i64 @llvm.dx.clamp.i64( int64_t test_clamp_long(int64_t p0, int64_t p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <2 x i64> @ +// CHECK-LABEL: define noundef <2 x i64> @_Z16test_clamp_long2 // CHECK: call <2 x i64> @llvm.dx.clamp.v2i64 int64_t2 test_clamp_long2(int64_t2 p0, int64_t2 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <3 x i64> @ +// CHECK-LABEL: define noundef <3 x i64> @_Z16test_clamp_long3 // CHECK: call <3 x i64> @llvm.dx.clamp.v3i64 int64_t3 test_clamp_long3(int64_t3 p0, int64_t3 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <4 x i64> @ +// CHECK-LABEL: define noundef <4 x i64> @_Z16test_clamp_long4 // CHECK: call <4 x i64> @llvm.dx.clamp.v4i64 int64_t4 test_clamp_long4(int64_t4 p0, int64_t4 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef i64 @ +// CHECK-LABEL: define noundef i64 @_Z16test_clamp_ulong // CHECK: call i64 @llvm.dx.uclamp.i64( -uint64_t test_clamp_long(uint64_t p0, uint64_t p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <2 x i64> @ +uint64_t test_clamp_ulong(uint64_t p0, uint64_t p1) { return clamp(p0, p1,p1); } +// CHECK-LABEL: define noundef <2 x i64> @_Z17test_clamp_ulong2 // CHECK: call <2 x i64> @llvm.dx.uclamp.v2i64 -uint64_t2 test_clamp_long2(uint64_t2 p0, uint64_t2 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <3 x i64> @ +uint64_t2 test_clamp_ulong2(uint64_t2 p0, uint64_t2 p1) { return clamp(p0, p1,p1); } +// CHECK-LABEL: define noundef <3 x i64> @_Z17test_clamp_ulong3 // CHECK: call <3 x i64> @llvm.dx.uclamp.v3i64 -uint64_t3 test_clamp_long3(uint64_t3 p0, uint64_t3 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <4 x i64> @ +uint64_t3 test_clamp_ulong3(uint64_t3 p0, uint64_t3 p1) { return clamp(p0, p1,p1); } +// CHECK-LABEL: define noundef <4 x i64> @_Z17test_clamp_ulong4 // CHECK: call <4 x i64> @llvm.dx.uclamp.v4i64 -uint64_t4 test_clamp_long4(uint64_t4 p0, uint64_t4 p1) { return clamp(p0, p1,p1); } 
+uint64_t4 test_clamp_ulong4(uint64_t4 p0, uint64_t4 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z15test_clamp_half // NATIVE_HALF: call half @llvm.dx.clamp.f16( -// NO_HALF: define noundef float @"?test_clamp_half +// NO_HALF-LABEL: define noundef float @_Z15test_clamp_half // NO_HALF: call float @llvm.dx.clamp.f32( half test_clamp_half(half p0, half p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z16test_clamp_half2 // NATIVE_HALF: call <2 x half> @llvm.dx.clamp.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_clamp_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z16test_clamp_half2 // NO_HALF: call <2 x float> @llvm.dx.clamp.v2f32( half2 test_clamp_half2(half2 p0, half2 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z16test_clamp_half3 // NATIVE_HALF: call <3 x half> @llvm.dx.clamp.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_clamp_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z16test_clamp_half3 // NO_HALF: call <3 x float> @llvm.dx.clamp.v3f32( half3 test_clamp_half3(half3 p0, half3 p1) { return clamp(p0, p1,p1); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z16test_clamp_half4 // NATIVE_HALF: call <4 x half> @llvm.dx.clamp.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_clamp_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z16test_clamp_half4 // NO_HALF: call <4 x float> @llvm.dx.clamp.v4f32( half4 test_clamp_half4(half4 p0, half4 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef float @"?test_clamp_float +// CHECK-LABEL: define noundef float @_Z16test_clamp_float // CHECK: call float @llvm.dx.clamp.f32( float test_clamp_float(float p0, float p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <2 x float> 
@"?test_clamp_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z17test_clamp_float2 // CHECK: call <2 x float> @llvm.dx.clamp.v2f32 float2 test_clamp_float2(float2 p0, float2 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <3 x float> @"?test_clamp_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z17test_clamp_float3 // CHECK: call <3 x float> @llvm.dx.clamp.v3f32 float3 test_clamp_float3(float3 p0, float3 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <4 x float> @"?test_clamp_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z17test_clamp_float4 // CHECK: call <4 x float> @llvm.dx.clamp.v4f32 float4 test_clamp_float4(float4 p0, float4 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef double @ +// CHECK-LABEL: define noundef double @_Z17test_clamp_double // CHECK: call double @llvm.dx.clamp.f64( double test_clamp_double(double p0, double p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <2 x double> @ +// CHECK-LABEL: define noundef <2 x double> @_Z18test_clamp_double2 // CHECK: call <2 x double> @llvm.dx.clamp.v2f64 double2 test_clamp_double2(double2 p0, double2 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <3 x double> @ +// CHECK-LABEL: define noundef <3 x double> @_Z18test_clamp_double3 // CHECK: call <3 x double> @llvm.dx.clamp.v3f64 double3 test_clamp_double3(double3 p0, double3 p1) { return clamp(p0, p1,p1); } -// CHECK: define noundef <4 x double> @ +// CHECK-LABEL: define noundef <4 x double> @_Z18test_clamp_double4 // CHECK: call <4 x double> @llvm.dx.clamp.v4f64 double4 test_clamp_double4(double4 p0, double4 p1) { return clamp(p0, p1,p1); } diff --git a/clang/test/CodeGenHLSL/builtins/cos.hlsl b/clang/test/CodeGenHLSL/builtins/cos.hlsl index 58b63097788136..4a41a9ec4a7cac 100644 --- a/clang/test/CodeGenHLSL/builtins/cos.hlsl +++ b/clang/test/CodeGenHLSL/builtins/cos.hlsl @@ -1,41 +1,40 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: 
dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_cos_half // NATIVE_HALF: call half @llvm.cos.f16( -// NO_HALF: define noundef float @"?test_cos_half +// NO_HALF-LABEL: define noundef float @_Z13test_cos_half // NO_HALF: call float @llvm.cos.f32( half test_cos_half(half p0) { return cos(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_cos_half2 // NATIVE_HALF: call <2 x half> @llvm.cos.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_cos_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_cos_half2 // NO_HALF: call <2 x float> @llvm.cos.v2f32( half2 test_cos_half2(half2 p0) { return cos(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_cos_half3 // NATIVE_HALF: call <3 x half> @llvm.cos.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_cos_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_cos_half3 // NO_HALF: call <3 x float> @llvm.cos.v3f32( half3 test_cos_half3(half3 p0) { return cos(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> 
@_Z14test_cos_half4 // NATIVE_HALF: call <4 x half> @llvm.cos.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_cos_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_cos_half4 // NO_HALF: call <4 x float> @llvm.cos.v4f32( half4 test_cos_half4(half4 p0) { return cos(p0); } -// CHECK: define noundef float @"?test_cos_float +// CHECK-LABEL: define noundef float @_Z14test_cos_float // CHECK: call float @llvm.cos.f32( float test_cos_float(float p0) { return cos(p0); } -// CHECK: define noundef <2 x float> @"?test_cos_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z15test_cos_float2 // CHECK: call <2 x float> @llvm.cos.v2f32 float2 test_cos_float2(float2 p0) { return cos(p0); } -// CHECK: define noundef <3 x float> @"?test_cos_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z15test_cos_float3 // CHECK: call <3 x float> @llvm.cos.v3f32 float3 test_cos_float3(float3 p0) { return cos(p0); } -// CHECK: define noundef <4 x float> @"?test_cos_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z15test_cos_float4 // CHECK: call <4 x float> @llvm.cos.v4f32 float4 test_cos_float4(float4 p0) { return cos(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/exp.hlsl b/clang/test/CodeGenHLSL/builtins/exp.hlsl index 773edbe3364fd2..3445cfd2e71f60 100644 --- a/clang/test/CodeGenHLSL/builtins/exp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp.hlsl @@ -1,53 +1,52 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: 
FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_exp_half // NATIVE_HALF: %elt.exp = call half @llvm.exp.f16( // NATIVE_HALF: ret half %elt.exp -// NO_HALF: define noundef float @"?test_exp_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z13test_exp_half // NO_HALF: %elt.exp = call float @llvm.exp.f32( // NO_HALF: ret float %elt.exp half test_exp_half(half p0) { return exp(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_exp_half2 // NATIVE_HALF: %elt.exp = call <2 x half> @llvm.exp.v2f16 // NATIVE_HALF: ret <2 x half> %elt.exp -// NO_HALF: define noundef <2 x float> @ +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_exp_half2 // NO_HALF: %elt.exp = call <2 x float> @llvm.exp.v2f32( // NO_HALF: ret <2 x float> %elt.exp half2 test_exp_half2(half2 p0) { return exp(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_exp_half3 // NATIVE_HALF: %elt.exp = call <3 x half> @llvm.exp.v3f16 // NATIVE_HALF: ret <3 x half> %elt.exp -// NO_HALF: define noundef <3 x float> @ +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_exp_half3 // NO_HALF: %elt.exp = call <3 x float> @llvm.exp.v3f32( // NO_HALF: ret <3 x float> %elt.exp half3 test_exp_half3(half3 p0) { return exp(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_exp_half4 // NATIVE_HALF: %elt.exp = call <4 x half> @llvm.exp.v4f16 // NATIVE_HALF: ret <4 x half> %elt.exp -// NO_HALF: define noundef <4 x float> @ +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_exp_half4 // NO_HALF: %elt.exp = call <4 x float> @llvm.exp.v4f32( // 
NO_HALF: ret <4 x float> %elt.exp half4 test_exp_half4(half4 p0) { return exp(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z14test_exp_float // CHECK: %elt.exp = call float @llvm.exp.f32( // CHECK: ret float %elt.exp float test_exp_float(float p0) { return exp(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z15test_exp_float2 // CHECK: %elt.exp = call <2 x float> @llvm.exp.v2f32 // CHECK: ret <2 x float> %elt.exp float2 test_exp_float2(float2 p0) { return exp(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z15test_exp_float3 // CHECK: %elt.exp = call <3 x float> @llvm.exp.v3f32 // CHECK: ret <3 x float> %elt.exp float3 test_exp_float3(float3 p0) { return exp(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z15test_exp_float4 // CHECK: %elt.exp = call <4 x float> @llvm.exp.v4f32 // CHECK: ret <4 x float> %elt.exp float4 test_exp_float4(float4 p0) { return exp(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/exp2.hlsl b/clang/test/CodeGenHLSL/builtins/exp2.hlsl index f21cdd95774ab6..7bfc897beee16d 100644 --- a/clang/test/CodeGenHLSL/builtins/exp2.hlsl +++ b/clang/test/CodeGenHLSL/builtins/exp2.hlsl @@ -1,53 +1,52 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header 
-triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z14test_exp2_half // NATIVE_HALF: %elt.exp2 = call half @llvm.exp2.f16( // NATIVE_HALF: ret half %elt.exp2 -// NO_HALF: define noundef float @"?test_exp2_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z14test_exp2_half // NO_HALF: %elt.exp2 = call float @llvm.exp2.f32( // NO_HALF: ret float %elt.exp2 half test_exp2_half(half p0) { return exp2(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z15test_exp2_half2 // NATIVE_HALF: %elt.exp2 = call <2 x half> @llvm.exp2.v2f16 // NATIVE_HALF: ret <2 x half> %elt.exp2 -// NO_HALF: define noundef <2 x float> @ +// NO_HALF-LABEL: define noundef <2 x float> @_Z15test_exp2_half2 // NO_HALF: %elt.exp2 = call <2 x float> @llvm.exp2.v2f32( // NO_HALF: ret <2 x float> %elt.exp2 half2 test_exp2_half2(half2 p0) { return exp2(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z15test_exp2_half3 // NATIVE_HALF: %elt.exp2 = call <3 x half> @llvm.exp2.v3f16 // NATIVE_HALF: ret <3 x half> %elt.exp2 -// NO_HALF: define noundef <3 x float> @ +// NO_HALF-LABEL: define noundef <3 x float> @_Z15test_exp2_half3 // NO_HALF: %elt.exp2 = call <3 x float> @llvm.exp2.v3f32( // NO_HALF: ret <3 x float> %elt.exp2 half3 test_exp2_half3(half3 p0) { return exp2(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z15test_exp2_half4 // NATIVE_HALF: %elt.exp2 = call <4 x half> @llvm.exp2.v4f16 // NATIVE_HALF: ret <4 x half> %elt.exp2 -// NO_HALF: define noundef <4 x float> @ +// NO_HALF-LABEL: define noundef <4 x float> @_Z15test_exp2_half4 // NO_HALF: %elt.exp2 = call <4 x float> @llvm.exp2.v4f32( // NO_HALF: ret <4 x float> %elt.exp2 half4 
test_exp2_half4(half4 p0) { return exp2(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z15test_exp2_float // CHECK: %elt.exp2 = call float @llvm.exp2.f32( // CHECK: ret float %elt.exp2 float test_exp2_float(float p0) { return exp2(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z16test_exp2_float2 // CHECK: %elt.exp2 = call <2 x float> @llvm.exp2.v2f32 // CHECK: ret <2 x float> %elt.exp2 float2 test_exp2_float2(float2 p0) { return exp2(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z16test_exp2_float3 // CHECK: %elt.exp2 = call <3 x float> @llvm.exp2.v3f32 // CHECK: ret <3 x float> %elt.exp2 float3 test_exp2_float3(float3 p0) { return exp2(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z16test_exp2_float4 // CHECK: %elt.exp2 = call <4 x float> @llvm.exp2.v4f32 // CHECK: ret <4 x float> %elt.exp2 float4 test_exp2_float4(float4 p0) { return exp2(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/floor.hlsl b/clang/test/CodeGenHLSL/builtins/floor.hlsl index 48ddf713bcf504..c2d6f1bcc335c9 100644 --- a/clang/test/CodeGenHLSL/builtins/floor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/floor.hlsl @@ -1,43 +1,42 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 
-finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF using hlsl::floor; -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z15test_floor_half // NATIVE_HALF: call half @llvm.floor.f16( -// NO_HALF: define noundef float @"?test_floor_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z15test_floor_half // NO_HALF: call float @llvm.floor.f32(float %0) half test_floor_half(half p0) { return floor(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z16test_floor_half2 // NATIVE_HALF: call <2 x half> @llvm.floor.v2f16( -// NO_HALF: define noundef <2 x float> @"?test_floor_half2@@YAT?$__vector@$halff@$01@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <2 x float> @_Z16test_floor_half2 // NO_HALF: call <2 x float> @llvm.floor.v2f32( half2 test_floor_half2(half2 p0) { return floor(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z16test_floor_half3 // NATIVE_HALF: call <3 x half> @llvm.floor.v3f16( -// NO_HALF: define noundef <3 x float> @"?test_floor_half3@@YAT?$__vector@$halff@$02@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <3 x float> @_Z16test_floor_half3 // NO_HALF: call <3 x float> @llvm.floor.v3f32( half3 test_floor_half3(half3 p0) { return floor(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z16test_floor_half4 // NATIVE_HALF: call <4 x half> @llvm.floor.v4f16( -// NO_HALF: define noundef <4 x float> @"?test_floor_half4@@YAT?$__vector@$halff@$03@__clang@@T12@@Z"( +// NO_HALF-LABEL: define noundef <4 x float> @_Z16test_floor_half4 // NO_HALF: call <4 x float> @llvm.floor.v4f32( half4 test_floor_half4(half4 p0) { return floor(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z16test_floor_float // 
CHECK: call float @llvm.floor.f32( float test_floor_float(float p0) { return floor(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z17test_floor_float2 // CHECK: call <2 x float> @llvm.floor.v2f32( float2 test_floor_float2(float2 p0) { return floor(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z17test_floor_float3 // CHECK: call <3 x float> @llvm.floor.v3f32( float3 test_floor_float3(float3 p0) { return floor(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z17test_floor_float4 // CHECK: call <4 x float> @llvm.floor.v4f32( float4 test_floor_float4(float4 p0) { return floor(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl index ce973309034781..e735a85b589f87 100644 --- a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl +++ b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl @@ -2,8 +2,8 @@ void foo(__hlsl_resource_t res); -// CHECK: define void @"?bar@@YAXU__hlsl_resource_t@@@Z"(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[PARAM:[a-zA-Z0-9]+]]) -// CHECK: call void @"?foo@@YAXU__hlsl_resource_t@@@Z"(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[PARAM]]) +// CHECK: define void @_Z3baru17__hlsl_resource_t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[PARAM:[a-zA-Z0-9]+]]) +// CHECK: call void @_Z3foou17__hlsl_resource_t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[PARAM]]) void bar(__hlsl_resource_t a) { foo(a); } diff --git a/clang/test/CodeGenHLSL/builtins/log.hlsl b/clang/test/CodeGenHLSL/builtins/log.hlsl index c89eda683403b4..71ce502eb8c4a8 100644 --- a/clang/test/CodeGenHLSL/builtins/log.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log.hlsl @@ -1,41 +1,40 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm 
-disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_log_half // NATIVE_HALF: call half @llvm.log.f16( -// NO_HALF: define noundef float @"?test_log_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z13test_log_half // NO_HALF: call float @llvm.log.f32( half test_log_half(half p0) { return log(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_log_half2 // NATIVE_HALF: call <2 x half> @llvm.log.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_log_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_log_half2 // NO_HALF: call <2 x float> @llvm.log.v2f32( half2 test_log_half2(half2 p0) { return log(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_log_half3 // NATIVE_HALF: call <3 x half> @llvm.log.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_log_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_log_half3 // NO_HALF: call <3 x float> @llvm.log.v3f32( half3 test_log_half3(half3 p0) { return log(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_log_half4 // NATIVE_HALF: call <4 x half> @llvm.log.v4f16 -// 
NO_HALF: define noundef <4 x float> @"?test_log_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_log_half4 // NO_HALF: call <4 x float> @llvm.log.v4f32( half4 test_log_half4(half4 p0) { return log(p0); } -// CHECK: define noundef float @"?test_log_float +// CHECK-LABEL: define noundef float @_Z14test_log_float // CHECK: call float @llvm.log.f32( float test_log_float(float p0) { return log(p0); } -// CHECK: define noundef <2 x float> @"?test_log_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z15test_log_float2 // CHECK: call <2 x float> @llvm.log.v2f32 float2 test_log_float2(float2 p0) { return log(p0); } -// CHECK: define noundef <3 x float> @"?test_log_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z15test_log_float3 // CHECK: call <3 x float> @llvm.log.v3f32 float3 test_log_float3(float3 p0) { return log(p0); } -// CHECK: define noundef <4 x float> @"?test_log_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z15test_log_float4 // CHECK: call <4 x float> @llvm.log.v4f32 float4 test_log_float4(float4 p0) { return log(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/log10.hlsl b/clang/test/CodeGenHLSL/builtins/log10.hlsl index 638b86e8d5eaf7..e15b6f5747b0a8 100644 --- a/clang/test/CodeGenHLSL/builtins/log10.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log10.hlsl @@ -1,41 +1,40 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s 
--check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z15test_log10_half // NATIVE_HALF: call half @llvm.log10.f16( -// NO_HALF: define noundef float @"?test_log10_half +// NO_HALF-LABEL: define noundef float @_Z15test_log10_half // NO_HALF: call float @llvm.log10.f32( half test_log10_half(half p0) { return log10(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z16test_log10_half2 // NATIVE_HALF: call <2 x half> @llvm.log10.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_log10_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z16test_log10_half2 // NO_HALF: call <2 x float> @llvm.log10.v2f32( half2 test_log10_half2(half2 p0) { return log10(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z16test_log10_half3 // NATIVE_HALF: call <3 x half> @llvm.log10.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_log10_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z16test_log10_half3 // NO_HALF: call <3 x float> @llvm.log10.v3f32( half3 test_log10_half3(half3 p0) { return log10(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z16test_log10_half4 // NATIVE_HALF: call <4 x half> @llvm.log10.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_log10_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z16test_log10_half4 // NO_HALF: call <4 x float> @llvm.log10.v4f32( half4 test_log10_half4(half4 p0) { return log10(p0); } -// CHECK: define noundef float @"?test_log10_float +// CHECK-LABEL: define noundef float @_Z16test_log10_float // CHECK: call float @llvm.log10.f32( float test_log10_float(float p0) { return log10(p0); } -// CHECK: define 
noundef <2 x float> @"?test_log10_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z17test_log10_float2 // CHECK: call <2 x float> @llvm.log10.v2f32 float2 test_log10_float2(float2 p0) { return log10(p0); } -// CHECK: define noundef <3 x float> @"?test_log10_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z17test_log10_float3 // CHECK: call <3 x float> @llvm.log10.v3f32 float3 test_log10_float3(float3 p0) { return log10(p0); } -// CHECK: define noundef <4 x float> @"?test_log10_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z17test_log10_float4 // CHECK: call <4 x float> @llvm.log10.v4f32 float4 test_log10_float4(float4 p0) { return log10(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/log2.hlsl b/clang/test/CodeGenHLSL/builtins/log2.hlsl index 31c7bff214c61f..575761a5f637c0 100644 --- a/clang/test/CodeGenHLSL/builtins/log2.hlsl +++ b/clang/test/CodeGenHLSL/builtins/log2.hlsl @@ -1,41 +1,40 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z14test_log2_half // NATIVE_HALF: call half @llvm.log2.f16( -// NO_HALF: define noundef float @"?test_log2_half +// 
NO_HALF-LABEL: define noundef float @_Z14test_log2_half // NO_HALF: call float @llvm.log2.f32( half test_log2_half(half p0) { return log2(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z15test_log2_half2 // NATIVE_HALF: call <2 x half> @llvm.log2.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_log2_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z15test_log2_half2 // NO_HALF: call <2 x float> @llvm.log2.v2f32( half2 test_log2_half2(half2 p0) { return log2(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z15test_log2_half3 // NATIVE_HALF: call <3 x half> @llvm.log2.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_log2_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z15test_log2_half3 // NO_HALF: call <3 x float> @llvm.log2.v3f32( half3 test_log2_half3(half3 p0) { return log2(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z15test_log2_half4 // NATIVE_HALF: call <4 x half> @llvm.log2.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_log2_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z15test_log2_half4 // NO_HALF: call <4 x float> @llvm.log2.v4f32( half4 test_log2_half4(half4 p0) { return log2(p0); } -// CHECK: define noundef float @"?test_log2_float +// CHECK-LABEL: define noundef float @_Z15test_log2_float // CHECK: call float @llvm.log2.f32( float test_log2_float(float p0) { return log2(p0); } -// CHECK: define noundef <2 x float> @"?test_log2_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z16test_log2_float2 // CHECK: call <2 x float> @llvm.log2.v2f32 float2 test_log2_float2(float2 p0) { return log2(p0); } -// CHECK: define noundef <3 x float> @"?test_log2_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z16test_log2_float3 // CHECK: call <3 x float> @llvm.log2.v3f32 float3 test_log2_float3(float3 p0) { return log2(p0); } -// CHECK: define noundef <4 x 
float> @"?test_log2_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z16test_log2_float4 // CHECK: call <4 x float> @llvm.log2.v4f32 float4 test_log2_float4(float4 p0) { return log2(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/max.hlsl b/clang/test/CodeGenHLSL/builtins/max.hlsl index f17062f7bb0115..d462fda2ccb09f 100644 --- a/clang/test/CodeGenHLSL/builtins/max.hlsl +++ b/clang/test/CodeGenHLSL/builtins/max.hlsl @@ -1,134 +1,133 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z14test_max_short // NATIVE_HALF: call i16 @llvm.smax.i16( int16_t test_max_short(int16_t p0, int16_t p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z15test_max_short2 // NATIVE_HALF: call <2 x i16> @llvm.smax.v2i16( int16_t2 test_max_short2(int16_t2 p0, int16_t2 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z15test_max_short3 // NATIVE_HALF: call <3 x i16> @llvm.smax.v3i16 int16_t3 test_max_short3(int16_t3 p0, 
int16_t3 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z15test_max_short4 // NATIVE_HALF: call <4 x i16> @llvm.smax.v4i16 int16_t4 test_max_short4(int16_t4 p0, int16_t4 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z15test_max_ushort // NATIVE_HALF: call i16 @llvm.umax.i16( uint16_t test_max_ushort(uint16_t p0, uint16_t p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z16test_max_ushort2 // NATIVE_HALF: call <2 x i16> @llvm.umax.v2i16 uint16_t2 test_max_ushort2(uint16_t2 p0, uint16_t2 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z16test_max_ushort3 // NATIVE_HALF: call <3 x i16> @llvm.umax.v3i16 uint16_t3 test_max_ushort3(uint16_t3 p0, uint16_t3 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z16test_max_ushort4 // NATIVE_HALF: call <4 x i16> @llvm.umax.v4i16 uint16_t4 test_max_ushort4(uint16_t4 p0, uint16_t4 p1) { return max(p0, p1); } #endif -// CHECK: define noundef i32 @ +// CHECK-LABEL: define noundef i32 @_Z12test_max_int // CHECK: call i32 @llvm.smax.i32( int test_max_int(int p0, int p1) { return max(p0, p1); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z13test_max_int2 // CHECK: call <2 x i32> @llvm.smax.v2i32 int2 test_max_int2(int2 p0, int2 p1) { return max(p0, p1); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z13test_max_int3 // CHECK: call <3 x i32> @llvm.smax.v3i32 int3 test_max_int3(int3 p0, int3 p1) { return max(p0, p1); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z13test_max_int4 // CHECK: call <4 x i32> @llvm.smax.v4i32 int4 test_max_int4(int4 p0, int4 p1) { return max(p0, 
p1); } -// CHECK: define noundef i32 @ +// CHECK-LABEL: define noundef i32 @_Z13test_max_uint // CHECK: call i32 @llvm.umax.i32( int test_max_uint(uint p0, uint p1) { return max(p0, p1); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z14test_max_uint2 // CHECK: call <2 x i32> @llvm.umax.v2i32 uint2 test_max_uint2(uint2 p0, uint2 p1) { return max(p0, p1); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z14test_max_uint3 // CHECK: call <3 x i32> @llvm.umax.v3i32 uint3 test_max_uint3(uint3 p0, uint3 p1) { return max(p0, p1); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z14test_max_uint4 // CHECK: call <4 x i32> @llvm.umax.v4i32 uint4 test_max_uint4(uint4 p0, uint4 p1) { return max(p0, p1); } -// CHECK: define noundef i64 @ +// CHECK-LABEL: define noundef i64 @_Z13test_max_long // CHECK: call i64 @llvm.smax.i64( int64_t test_max_long(int64_t p0, int64_t p1) { return max(p0, p1); } -// CHECK: define noundef <2 x i64> @ +// CHECK-LABEL: define noundef <2 x i64> @_Z14test_max_long2 // CHECK: call <2 x i64> @llvm.smax.v2i64 int64_t2 test_max_long2(int64_t2 p0, int64_t2 p1) { return max(p0, p1); } -// CHECK: define noundef <3 x i64> @ +// CHECK-LABEL: define noundef <3 x i64> @_Z14test_max_long3 // CHECK: call <3 x i64> @llvm.smax.v3i64 int64_t3 test_max_long3(int64_t3 p0, int64_t3 p1) { return max(p0, p1); } -// CHECK: define noundef <4 x i64> @ +// CHECK-LABEL: define noundef <4 x i64> @_Z14test_max_long4 // CHECK: call <4 x i64> @llvm.smax.v4i64 int64_t4 test_max_long4(int64_t4 p0, int64_t4 p1) { return max(p0, p1); } -// CHECK: define noundef i64 @ +// CHECK-LABEL: define noundef i64 @_Z14test_max_ulong // CHECK: call i64 @llvm.umax.i64( -uint64_t test_max_long(uint64_t p0, uint64_t p1) { return max(p0, p1); } -// CHECK: define noundef <2 x i64> @ +uint64_t test_max_ulong(uint64_t p0, uint64_t p1) { return max(p0, p1); } +// CHECK-LABEL: define noundef <2 x 
i64> @_Z15test_max_ulong2 // CHECK: call <2 x i64> @llvm.umax.v2i64 -uint64_t2 test_max_long2(uint64_t2 p0, uint64_t2 p1) { return max(p0, p1); } -// CHECK: define noundef <3 x i64> @ +uint64_t2 test_max_ulong2(uint64_t2 p0, uint64_t2 p1) { return max(p0, p1); } +// CHECK-LABEL: define noundef <3 x i64> @_Z15test_max_ulong3 // CHECK: call <3 x i64> @llvm.umax.v3i64 -uint64_t3 test_max_long3(uint64_t3 p0, uint64_t3 p1) { return max(p0, p1); } -// CHECK: define noundef <4 x i64> @ +uint64_t3 test_max_ulong3(uint64_t3 p0, uint64_t3 p1) { return max(p0, p1); } +// CHECK-LABEL: define noundef <4 x i64> @_Z15test_max_ulong4 // CHECK: call <4 x i64> @llvm.umax.v4i64 -uint64_t4 test_max_long4(uint64_t4 p0, uint64_t4 p1) { return max(p0, p1); } +uint64_t4 test_max_ulong4(uint64_t4 p0, uint64_t4 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_max_half // NATIVE_HALF: call half @llvm.maxnum.f16( -// NO_HALF: define noundef float @"?test_max_half +// NO_HALF-LABEL: define noundef float @_Z13test_max_half // NO_HALF: call float @llvm.maxnum.f32( half test_max_half(half p0, half p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_max_half2 // NATIVE_HALF: call <2 x half> @llvm.maxnum.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_max_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_max_half2 // NO_HALF: call <2 x float> @llvm.maxnum.v2f32( half2 test_max_half2(half2 p0, half2 p1) { return max(p0, p1); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_max_half3 // NATIVE_HALF: call <3 x half> @llvm.maxnum.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_max_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_max_half3 // NO_HALF: call <3 x float> @llvm.maxnum.v3f32( half3 test_max_half3(half3 p0, half3 p1) { return max(p0, p1); } -// 
NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_max_half4 // NATIVE_HALF: call <4 x half> @llvm.maxnum.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_max_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_max_half4 // NO_HALF: call <4 x float> @llvm.maxnum.v4f32( half4 test_max_half4(half4 p0, half4 p1) { return max(p0, p1); } -// CHECK: define noundef float @"?test_max_float +// CHECK-LABEL: define noundef float @_Z14test_max_float // CHECK: call float @llvm.maxnum.f32( float test_max_float(float p0, float p1) { return max(p0, p1); } -// CHECK: define noundef <2 x float> @"?test_max_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z15test_max_float2 // CHECK: call <2 x float> @llvm.maxnum.v2f32 float2 test_max_float2(float2 p0, float2 p1) { return max(p0, p1); } -// CHECK: define noundef <3 x float> @"?test_max_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z15test_max_float3 // CHECK: call <3 x float> @llvm.maxnum.v3f32 float3 test_max_float3(float3 p0, float3 p1) { return max(p0, p1); } -// CHECK: define noundef <4 x float> @"?test_max_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z15test_max_float4 // CHECK: call <4 x float> @llvm.maxnum.v4f32 float4 test_max_float4(float4 p0, float4 p1) { return max(p0, p1); } -// CHECK: define noundef double @ +// CHECK-LABEL: define noundef double @_Z15test_max_double // CHECK: call double @llvm.maxnum.f64( double test_max_double(double p0, double p1) { return max(p0, p1); } -// CHECK: define noundef <2 x double> @ +// CHECK-LABEL: define noundef <2 x double> @_Z16test_max_double2 // CHECK: call <2 x double> @llvm.maxnum.v2f64 double2 test_max_double2(double2 p0, double2 p1) { return max(p0, p1); } -// CHECK: define noundef <3 x double> @ +// CHECK-LABEL: define noundef <3 x double> @_Z16test_max_double3 // CHECK: call <3 x double> @llvm.maxnum.v3f64 double3 test_max_double3(double3 p0, double3 p1) { return max(p0, p1); } -// CHECK: define 
noundef <4 x double> @ +// CHECK-LABEL: define noundef <4 x double> @_Z16test_max_double4 // CHECK: call <4 x double> @llvm.maxnum.v4f64 double4 test_max_double4(double4 p0, double4 p1) { return max(p0, p1); } diff --git a/clang/test/CodeGenHLSL/builtins/min.hlsl b/clang/test/CodeGenHLSL/builtins/min.hlsl index a0c233dac4d5fc..02d20d13f916de 100644 --- a/clang/test/CodeGenHLSL/builtins/min.hlsl +++ b/clang/test/CodeGenHLSL/builtins/min.hlsl @@ -1,134 +1,133 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF #ifdef __HLSL_ENABLE_16_BIT -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z14test_min_short // NATIVE_HALF: call i16 @llvm.smin.i16( int16_t test_min_short(int16_t p0, int16_t p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z15test_min_short2 // NATIVE_HALF: call <2 x i16> @llvm.smin.v2i16( int16_t2 test_min_short2(int16_t2 p0, int16_t2 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z15test_min_short3 // NATIVE_HALF: call <3 x i16> @llvm.smin.v3i16 int16_t3 
test_min_short3(int16_t3 p0, int16_t3 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z15test_min_short4 // NATIVE_HALF: call <4 x i16> @llvm.smin.v4i16 int16_t4 test_min_short4(int16_t4 p0, int16_t4 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef i16 @ +// NATIVE_HALF-LABEL: define noundef i16 @_Z15test_min_ushort // NATIVE_HALF: call i16 @llvm.umin.i16( uint16_t test_min_ushort(uint16_t p0, uint16_t p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <2 x i16> @ +// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z16test_min_ushort2 // NATIVE_HALF: call <2 x i16> @llvm.umin.v2i16 uint16_t2 test_min_ushort2(uint16_t2 p0, uint16_t2 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <3 x i16> @ +// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z16test_min_ushort3 // NATIVE_HALF: call <3 x i16> @llvm.umin.v3i16 uint16_t3 test_min_ushort3(uint16_t3 p0, uint16_t3 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <4 x i16> @ +// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z16test_min_ushort4 // NATIVE_HALF: call <4 x i16> @llvm.umin.v4i16 uint16_t4 test_min_ushort4(uint16_t4 p0, uint16_t4 p1) { return min(p0, p1); } #endif -// CHECK: define noundef i32 @ +// CHECK-LABEL: define noundef i32 @_Z12test_min_int // CHECK: call i32 @llvm.smin.i32( int test_min_int(int p0, int p1) { return min(p0, p1); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z13test_min_int2 // CHECK: call <2 x i32> @llvm.smin.v2i32 int2 test_min_int2(int2 p0, int2 p1) { return min(p0, p1); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z13test_min_int3 // CHECK: call <3 x i32> @llvm.smin.v3i32 int3 test_min_int3(int3 p0, int3 p1) { return min(p0, p1); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z13test_min_int4 // CHECK: call <4 x i32> @llvm.smin.v4i32 int4 test_min_int4(int4 p0, 
int4 p1) { return min(p0, p1); } -// CHECK: define noundef i32 @ +// CHECK-LABEL: define noundef i32 @_Z13test_min_uint // CHECK: call i32 @llvm.umin.i32( int test_min_uint(uint p0, uint p1) { return min(p0, p1); } -// CHECK: define noundef <2 x i32> @ +// CHECK-LABEL: define noundef <2 x i32> @_Z14test_min_uint2 // CHECK: call <2 x i32> @llvm.umin.v2i32 uint2 test_min_uint2(uint2 p0, uint2 p1) { return min(p0, p1); } -// CHECK: define noundef <3 x i32> @ +// CHECK-LABEL: define noundef <3 x i32> @_Z14test_min_uint3 // CHECK: call <3 x i32> @llvm.umin.v3i32 uint3 test_min_uint3(uint3 p0, uint3 p1) { return min(p0, p1); } -// CHECK: define noundef <4 x i32> @ +// CHECK-LABEL: define noundef <4 x i32> @_Z14test_min_uint4 // CHECK: call <4 x i32> @llvm.umin.v4i32 uint4 test_min_uint4(uint4 p0, uint4 p1) { return min(p0, p1); } -// CHECK: define noundef i64 @ +// CHECK-LABEL: define noundef i64 @_Z13test_min_long // CHECK: call i64 @llvm.smin.i64( int64_t test_min_long(int64_t p0, int64_t p1) { return min(p0, p1); } -// CHECK: define noundef <2 x i64> @ +// CHECK-LABEL: define noundef <2 x i64> @_Z14test_min_long2 // CHECK: call <2 x i64> @llvm.smin.v2i64 int64_t2 test_min_long2(int64_t2 p0, int64_t2 p1) { return min(p0, p1); } -// CHECK: define noundef <3 x i64> @ +// CHECK-LABEL: define noundef <3 x i64> @_Z14test_min_long3 // CHECK: call <3 x i64> @llvm.smin.v3i64 int64_t3 test_min_long3(int64_t3 p0, int64_t3 p1) { return min(p0, p1); } -// CHECK: define noundef <4 x i64> @ +// CHECK-LABEL: define noundef <4 x i64> @_Z14test_min_long4 // CHECK: call <4 x i64> @llvm.smin.v4i64 int64_t4 test_min_long4(int64_t4 p0, int64_t4 p1) { return min(p0, p1); } -// CHECK: define noundef i64 @ +// CHECK-LABEL: define noundef i64 @_Z14test_min_ulong // CHECK: call i64 @llvm.umin.i64( -uint64_t test_min_long(uint64_t p0, uint64_t p1) { return min(p0, p1); } -// CHECK: define noundef <2 x i64> @ +uint64_t test_min_ulong(uint64_t p0, uint64_t p1) { return min(p0, p1); } +// 
CHECK-LABEL: define noundef <2 x i64> @_Z15test_min_ulong2 // CHECK: call <2 x i64> @llvm.umin.v2i64 -uint64_t2 test_min_long2(uint64_t2 p0, uint64_t2 p1) { return min(p0, p1); } -// CHECK: define noundef <3 x i64> @ +uint64_t2 test_min_ulong2(uint64_t2 p0, uint64_t2 p1) { return min(p0, p1); } +// CHECK-LABEL: define noundef <3 x i64> @_Z15test_min_ulong3 // CHECK: call <3 x i64> @llvm.umin.v3i64 -uint64_t3 test_min_long3(uint64_t3 p0, uint64_t3 p1) { return min(p0, p1); } -// CHECK: define noundef <4 x i64> @ +uint64_t3 test_min_ulong3(uint64_t3 p0, uint64_t3 p1) { return min(p0, p1); } +// CHECK-LABEL: define noundef <4 x i64> @_Z15test_min_ulong4 // CHECK: call <4 x i64> @llvm.umin.v4i64 -uint64_t4 test_min_long4(uint64_t4 p0, uint64_t4 p1) { return min(p0, p1); } +uint64_t4 test_min_ulong4(uint64_t4 p0, uint64_t4 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_min_half // NATIVE_HALF: call half @llvm.minnum.f16( -// NO_HALF: define noundef float @"?test_min_half +// NO_HALF-LABEL: define noundef float @_Z13test_min_half // NO_HALF: call float @llvm.minnum.f32( half test_min_half(half p0, half p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_min_half2 // NATIVE_HALF: call <2 x half> @llvm.minnum.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_min_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_min_half2 // NO_HALF: call <2 x float> @llvm.minnum.v2f32( half2 test_min_half2(half2 p0, half2 p1) { return min(p0, p1); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_min_half3 // NATIVE_HALF: call <3 x half> @llvm.minnum.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_min_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_min_half3 // NO_HALF: call <3 x float> @llvm.minnum.v3f32( half3 test_min_half3(half3 p0, half3 p1) { 
return min(p0, p1); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_min_half4 // NATIVE_HALF: call <4 x half> @llvm.minnum.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_min_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_min_half4 // NO_HALF: call <4 x float> @llvm.minnum.v4f32( half4 test_min_half4(half4 p0, half4 p1) { return min(p0, p1); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z14test_min_float // CHECK: call float @llvm.minnum.f32( float test_min_float(float p0, float p1) { return min(p0, p1); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z15test_min_float2 // CHECK: call <2 x float> @llvm.minnum.v2f32 float2 test_min_float2(float2 p0, float2 p1) { return min(p0, p1); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z15test_min_float3 // CHECK: call <3 x float> @llvm.minnum.v3f32 float3 test_min_float3(float3 p0, float3 p1) { return min(p0, p1); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z15test_min_float4 // CHECK: call <4 x float> @llvm.minnum.v4f32 float4 test_min_float4(float4 p0, float4 p1) { return min(p0, p1); } -// CHECK: define noundef double @ +// CHECK-LABEL: define noundef double @_Z15test_min_double // CHECK: call double @llvm.minnum.f64( double test_min_double(double p0, double p1) { return min(p0, p1); } -// CHECK: define noundef <2 x double> @ +// CHECK-LABEL: define noundef <2 x double> @_Z16test_min_double2 // CHECK: call <2 x double> @llvm.minnum.v2f64 double2 test_min_double2(double2 p0, double2 p1) { return min(p0, p1); } -// CHECK: define noundef <3 x double> @ +// CHECK-LABEL: define noundef <3 x double> @_Z16test_min_double3 // CHECK: call <3 x double> @llvm.minnum.v3f64 double3 test_min_double3(double3 p0, double3 p1) { return min(p0, p1); } -// CHECK: define noundef <4 x double> @ +// CHECK-LABEL: 
define noundef <4 x double> @_Z16test_min_double4 // CHECK: call <4 x double> @llvm.minnum.v4f64 double4 test_min_double4(double4 p0, double4 p1) { return min(p0, p1); } diff --git a/clang/test/CodeGenHLSL/builtins/pow.hlsl b/clang/test/CodeGenHLSL/builtins/pow.hlsl index 9a2264e740751c..4e184807633438 100644 --- a/clang/test/CodeGenHLSL/builtins/pow.hlsl +++ b/clang/test/CodeGenHLSL/builtins/pow.hlsl @@ -1,41 +1,40 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_pow_half // NATIVE_HALF: call half @llvm.pow.f16( -// NO_HALF: define noundef float @"?test_pow_half +// NO_HALF-LABEL: define noundef float @_Z13test_pow_half // NO_HALF: call float @llvm.pow.f32( half test_pow_half(half p0, half p1) { return pow(p0, p1); } -// NATIVE_HALF: define noundef <2 x half> @"?test_pow_half2 +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_pow_half2 // NATIVE_HALF: call <2 x half> @llvm.pow.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_pow_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_pow_half2 // NO_HALF: call <2 x float> @llvm.pow.v2f32( half2 
test_pow_half2(half2 p0, half2 p1) { return pow(p0, p1); } -// NATIVE_HALF: define noundef <3 x half> @"?test_pow_half3 +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_pow_half3 // NATIVE_HALF: call <3 x half> @llvm.pow.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_pow_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_pow_half3 // NO_HALF: call <3 x float> @llvm.pow.v3f32( half3 test_pow_half3(half3 p0, half3 p1) { return pow(p0, p1); } -// NATIVE_HALF: define noundef <4 x half> @"?test_pow_half4 +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_pow_half4 // NATIVE_HALF: call <4 x half> @llvm.pow.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_pow_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_pow_half4 // NO_HALF: call <4 x float> @llvm.pow.v4f32( half4 test_pow_half4(half4 p0, half4 p1) { return pow(p0, p1); } -// CHECK: define noundef float @"?test_pow_float +// CHECK-LABEL: define noundef float @_Z14test_pow_float // CHECK: call float @llvm.pow.f32( float test_pow_float(float p0, float p1) { return pow(p0, p1); } -// CHECK: define noundef <2 x float> @"?test_pow_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z15test_pow_float2 // CHECK: call <2 x float> @llvm.pow.v2f32 float2 test_pow_float2(float2 p0, float2 p1) { return pow(p0, p1); } -// CHECK: define noundef <3 x float> @"?test_pow_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z15test_pow_float3 // CHECK: call <3 x float> @llvm.pow.v3f32 float3 test_pow_float3(float3 p0, float3 p1) { return pow(p0, p1); } -// CHECK: define noundef <4 x float> @"?test_pow_float4 +// CHECK-LABEL: define noundef <4 x float> @_Z15test_pow_float4 // CHECK: call <4 x float> @llvm.pow.v4f32 float4 test_pow_float4(float4 p0, float4 p1) { return pow(p0, p1); } diff --git a/clang/test/CodeGenHLSL/builtins/round.hlsl b/clang/test/CodeGenHLSL/builtins/round.hlsl index 33d761dbdfbeae..6da63a394a8fdc 100644 --- a/clang/test/CodeGenHLSL/builtins/round.hlsl +++ 
b/clang/test/CodeGenHLSL/builtins/round.hlsl @@ -1,53 +1,52 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z15test_round_half // NATIVE_HALF: %elt.roundeven = call half @llvm.roundeven.f16( // NATIVE_HALF: ret half %elt.roundeven -// NO_HALF: define noundef float @"?test_round_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z15test_round_half // NO_HALF: %elt.roundeven = call float @llvm.roundeven.f32( // NO_HALF: ret float %elt.roundeven half test_round_half(half p0) { return round(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z16test_round_half2 // NATIVE_HALF: %elt.roundeven = call <2 x half> @llvm.roundeven.v2f16 // NATIVE_HALF: ret <2 x half> %elt.roundeven -// NO_HALF: define noundef <2 x float> @ +// NO_HALF-LABEL: define noundef <2 x float> @_Z16test_round_half2 // NO_HALF: %elt.roundeven = call <2 x float> @llvm.roundeven.v2f32( // NO_HALF: ret <2 x float> %elt.roundeven half2 test_round_half2(half2 p0) { return round(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x 
half> @_Z16test_round_half3 // NATIVE_HALF: %elt.roundeven = call <3 x half> @llvm.roundeven.v3f16 // NATIVE_HALF: ret <3 x half> %elt.roundeven -// NO_HALF: define noundef <3 x float> @ +// NO_HALF-LABEL: define noundef <3 x float> @_Z16test_round_half3 // NO_HALF: %elt.roundeven = call <3 x float> @llvm.roundeven.v3f32( // NO_HALF: ret <3 x float> %elt.roundeven half3 test_round_half3(half3 p0) { return round(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z16test_round_half4 // NATIVE_HALF: %elt.roundeven = call <4 x half> @llvm.roundeven.v4f16 // NATIVE_HALF: ret <4 x half> %elt.roundeven -// NO_HALF: define noundef <4 x float> @ +// NO_HALF-LABEL: define noundef <4 x float> @_Z16test_round_half4 // NO_HALF: %elt.roundeven = call <4 x float> @llvm.roundeven.v4f32( // NO_HALF: ret <4 x float> %elt.roundeven half4 test_round_half4(half4 p0) { return round(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z16test_round_float // CHECK: %elt.roundeven = call float @llvm.roundeven.f32( // CHECK: ret float %elt.roundeven float test_round_float(float p0) { return round(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z17test_round_float2 // CHECK: %elt.roundeven = call <2 x float> @llvm.roundeven.v2f32 // CHECK: ret <2 x float> %elt.roundeven float2 test_round_float2(float2 p0) { return round(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z17test_round_float3 // CHECK: %elt.roundeven = call <3 x float> @llvm.roundeven.v3f32 // CHECK: ret <3 x float> %elt.roundeven float3 test_round_float3(float3 p0) { return round(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z17test_round_float4 // CHECK: %elt.roundeven = call <4 x float> @llvm.roundeven.v4f32 // CHECK: ret <4 x float> %elt.roundeven float4 test_round_float4(float4 p0) { return round(p0); } diff 
--git a/clang/test/CodeGenHLSL/builtins/saturate.hlsl b/clang/test/CodeGenHLSL/builtins/saturate.hlsl index 65a3cd74621cc0..c221f6e0f2c36f 100644 --- a/clang/test/CodeGenHLSL/builtins/saturate.hlsl +++ b/clang/test/CodeGenHLSL/builtins/saturate.hlsl @@ -1,95 +1,60 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -Dtar=dx +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -Dtar=dx -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=SPIRV,SPIRV_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-library %s \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=SPIRV,SPIRV_NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -Dtar=spv +// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: 
FileCheck %s --check-prefixes=CHECK,NO_HALF -Dtar=spv -// NATIVE_HALF: define noundef half @ -// NATIVE_HALF: call half @llvm.dx.saturate.f16( -// NO_HALF: define noundef float @"?test_saturate_half -// NO_HALF: call float @llvm.dx.saturate.f32( -// SPIRV_HALF: define spir_func noundef half @_Z18test_saturate_halfDh(half -// SPIRV_HALF: call half @llvm.spv.saturate.f16(half -// SPIRV_NO_HALF: define spir_func noundef float @_Z18test_saturate_halfDh(float -// SPIRV_NO_HALF: call float @llvm.spv.saturate.f32(float +// NATIVE_HALF-LABEL: define{{.*}} half @_Z18test_saturate_halfDh +// NATIVE_HALF: call half @llvm.[[tar]].saturate.f16( +// NO_HALF-LABEL: define{{.*}} float @_Z18test_saturate_halfDh +// NO_HALF: call float @llvm.[[tar]].saturate.f32( half test_saturate_half(half p0) { return saturate(p0); } -// NATIVE_HALF: define noundef <2 x half> @ -// NATIVE_HALF: call <2 x half> @llvm.dx.saturate.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_saturate_half2 -// NO_HALF: call <2 x float> @llvm.dx.saturate.v2f32( -// SPIRV_HALF: define spir_func noundef <2 x half> @_Z19test_saturate_half2Dv2_Dh( -// SPIRV_HALF: call <2 x half> @llvm.spv.saturate.v2f16(<2 x half> -// SPIRV_NO_HALF: define spir_func noundef <2 x float> @_Z19test_saturate_half2Dv2_Dh(<2 x float> -// SPIRV_NO_HALF: call <2 x float> @llvm.spv.saturate.v2f32(<2 x float> +// NATIVE_HALF-LABEL: define{{.*}} <2 x half> @_Z19test_saturate_half2Dv2_Dh +// NATIVE_HALF: call <2 x half> @llvm.[[tar]].saturate.v2f16 +// NO_HALF-LABEL: define{{.*}} <2 x float> @_Z19test_saturate_half2Dv2_Dh +// NO_HALF: call <2 x float> @llvm.[[tar]].saturate.v2f32( half2 test_saturate_half2(half2 p0) { return saturate(p0); } -// NATIVE_HALF: define noundef <3 x half> @ -// NATIVE_HALF: call <3 x half> @llvm.dx.saturate.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_saturate_half3 -// NO_HALF: call <3 x float> @llvm.dx.saturate.v3f32( -// SPIRV_HALF: define spir_func noundef <3 x half> @_Z19test_saturate_half3Dv3_Dh( 
-// SPIRV_HALF: call <3 x half> @llvm.spv.saturate.v3f16(<3 x half> -// SPIRV_NO_HALF: define spir_func noundef <3 x float> @_Z19test_saturate_half3Dv3_Dh(<3 x float> -// SPIRV_NO_HALF: call <3 x float> @llvm.spv.saturate.v3f32(<3 x float> +// NATIVE_HALF-LABEL: define{{.*}} <3 x half> @_Z19test_saturate_half3Dv3_Dh( +// NATIVE_HALF: call <3 x half> @llvm.[[tar]].saturate.v3f16 +// NO_HALF-LABEL: define{{.*}} <3 x float> @_Z19test_saturate_half3Dv3_Dh(<3 x float> +// NO_HALF: call <3 x float> @llvm.[[tar]].saturate.v3f32( half3 test_saturate_half3(half3 p0) { return saturate(p0); } -// NATIVE_HALF: define noundef <4 x half> @ -// NATIVE_HALF: call <4 x half> @llvm.dx.saturate.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_saturate_half4 -// NO_HALF: call <4 x float> @llvm.dx.saturate.v4f32( -// SPIRV_HALF: define spir_func noundef <4 x half> @_Z19test_saturate_half4Dv4_Dh( -// SPIRV_HALF: call <4 x half> @llvm.spv.saturate.v4f16(<4 x half> -// SPIRV_NO_HALF: define spir_func noundef <4 x float> @_Z19test_saturate_half4Dv4_Dh(<4 x float> -// SPIRV_NO_HALF: call <4 x float> @llvm.spv.saturate.v4f32(<4 x float> +// NATIVE_HALF-LABEL: define{{.*}} <4 x half> @_Z19test_saturate_half4Dv4_Dh( +// NATIVE_HALF: call <4 x half> @llvm.[[tar]].saturate.v4f16 +// NO_HALF-LABEL: define{{.*}} <4 x float> @_Z19test_saturate_half4Dv4_Dh(<4 x float> +// NO_HALF: call <4 x float> @llvm.[[tar]].saturate.v4f32( half4 test_saturate_half4(half4 p0) { return saturate(p0); } -// CHECK: define noundef float @"?test_saturate_float -// CHECK: call float @llvm.dx.saturate.f32( -// SPIRV: define spir_func noundef float @_Z19test_saturate_floatf(float -// SPIRV: call float @llvm.spv.saturate.f32(float +// CHECK-LABEL: define{{.*}} float @_Z19test_saturate_floatf( +// CHECK: call float @llvm.[[tar]].saturate.f32( float test_saturate_float(float p0) { return saturate(p0); } -// CHECK: define noundef <2 x float> @"?test_saturate_float2 -// CHECK: call <2 x float> @llvm.dx.saturate.v2f32 -// 
SPIRV: define spir_func noundef <2 x float> @_Z20test_saturate_float2Dv2_f(<2 x float> -// SPIRV: call <2 x float> @llvm.spv.saturate.v2f32(<2 x float> +// CHECK-LABEL: define{{.*}} <2 x float> @_Z20test_saturate_float2Dv2_f(<2 x float> +// CHECK: call <2 x float> @llvm.[[tar]].saturate.v2f32 float2 test_saturate_float2(float2 p0) { return saturate(p0); } -// CHECK: define noundef <3 x float> @"?test_saturate_float3 -// CHECK: call <3 x float> @llvm.dx.saturate.v3f32 -// SPIRV: define spir_func noundef <3 x float> @_Z20test_saturate_float3Dv3_f(<3 x float> -// SPIRV: call <3 x float> @llvm.spv.saturate.v3f32(<3 x float> +// CHECK-LABEL: define{{.*}} <3 x float> @_Z20test_saturate_float3Dv3_f(<3 x float> +// CHECK: call <3 x float> @llvm.[[tar]].saturate.v3f32 float3 test_saturate_float3(float3 p0) { return saturate(p0); } -// CHECK: define noundef <4 x float> @"?test_saturate_float4 -// CHECK: call <4 x float> @llvm.dx.saturate.v4f32 -// SPIRV: define spir_func noundef <4 x float> @_Z20test_saturate_float4Dv4_f(<4 x float> -// SPIRV: call <4 x float> @llvm.spv.saturate.v4f32(<4 x float> +// CHECK-LABEL: define{{.*}} <4 x float> @_Z20test_saturate_float4Dv4_f(<4 x float> +// CHECK: call <4 x float> @llvm.[[tar]].saturate.v4f32 float4 test_saturate_float4(float4 p0) { return saturate(p0); } -// CHECK: define noundef double @ -// CHECK: call double @llvm.dx.saturate.f64( -// SPIRV: define spir_func noundef double @_Z20test_saturate_doubled(double -// SPIRV: call double @llvm.spv.saturate.f64(double +// CHECK-LABEL: define{{.*}} double @_Z20test_saturate_doubled(double +// CHECK: call double @llvm.[[tar]].saturate.f64( double test_saturate_double(double p0) { return saturate(p0); } -// CHECK: define noundef <2 x double> @ -// CHECK: call <2 x double> @llvm.dx.saturate.v2f64 -// SPIRV: define spir_func noundef <2 x double> @_Z21test_saturate_double2Dv2_d(<2 x double> -// SPIRV: call <2 x double> @llvm.spv.saturate.v2f64(<2 x double> +// CHECK-LABEL: define{{.*}} <2 x 
double> @_Z21test_saturate_double2Dv2_d(<2 x double> +// CHECK: call <2 x double> @llvm.[[tar]].saturate.v2f64 double2 test_saturate_double2(double2 p0) { return saturate(p0); } -// CHECK: define noundef <3 x double> @ -// CHECK: call <3 x double> @llvm.dx.saturate.v3f64 -// SPIRV: define spir_func noundef <3 x double> @_Z21test_saturate_double3Dv3_d(<3 x double> -// SPIRV: call <3 x double> @llvm.spv.saturate.v3f64(<3 x double> +// CHECK-LABEL: define{{.*}} <3 x double> @_Z21test_saturate_double3Dv3_d(<3 x double> +// CHECK: call <3 x double> @llvm.[[tar]].saturate.v3f64 double3 test_saturate_double3(double3 p0) { return saturate(p0); } -// CHECK: define noundef <4 x double> @ -// CHECK: call <4 x double> @llvm.dx.saturate.v4f64 -// SPIRV: define spir_func noundef <4 x double> @_Z21test_saturate_double4Dv4_d(<4 x double> -// SPIRV: call <4 x double> @llvm.spv.saturate.v4f64(<4 x double> +// CHECK-LABEL: define{{.*}} <4 x double> @_Z21test_saturate_double4Dv4_d(<4 x double> +// CHECK: call <4 x double> @llvm.[[tar]].saturate.v4f64 double4 test_saturate_double4(double4 p0) { return saturate(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/sin.hlsl b/clang/test/CodeGenHLSL/builtins/sin.hlsl index 83e8a5be39d069..9f7fa5043bdc7d 100644 --- a/clang/test/CodeGenHLSL/builtins/sin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sin.hlsl @@ -1,41 +1,40 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: 
FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z13test_sin_half // NATIVE_HALF: call half @llvm.sin.f16( -// NO_HALF: define noundef float @"?test_sin_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z13test_sin_half // NO_HALF: call float @llvm.sin.f32( half test_sin_half(half p0) { return sin(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z14test_sin_half2 // NATIVE_HALF: call <2 x half> @llvm.sin.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_sin_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z14test_sin_half2 // NO_HALF: call <2 x float> @llvm.sin.v2f32( half2 test_sin_half2(half2 p0) { return sin(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z14test_sin_half3 // NATIVE_HALF: call <3 x half> @llvm.sin.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_sin_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z14test_sin_half3 // NO_HALF: call <3 x float> @llvm.sin.v3f32( half3 test_sin_half3(half3 p0) { return sin(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z14test_sin_half4 // NATIVE_HALF: call <4 x half> @llvm.sin.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_sin_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z14test_sin_half4 // NO_HALF: call <4 x float> @llvm.sin.v4f32( half4 test_sin_half4(half4 p0) { return sin(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z14test_sin_float // CHECK: call float @llvm.sin.f32( float test_sin_float(float p0) { return sin(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef 
<2 x float> @_Z15test_sin_float2 // CHECK: call <2 x float> @llvm.sin.v2f32 float2 test_sin_float2(float2 p0) { return sin(p0); } -// CHECK: define noundef <3 x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z15test_sin_float3 // CHECK: call <3 x float> @llvm.sin.v3f32 float3 test_sin_float3(float3 p0) { return sin(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z15test_sin_float4 // CHECK: call <4 x float> @llvm.sin.v4f32 float4 test_sin_float4(float4 p0) { return sin(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl index adbbf69a8e0685..63454cea3fe6fb 100644 --- a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl @@ -1,53 +1,52 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF -// NATIVE_HALF: define noundef half @ +// NATIVE_HALF-LABEL: define noundef half @_Z14test_sqrt_half // NATIVE_HALF: %{{.*}} = call half @llvm.sqrt.f16( // NATIVE_HALF: ret half %{{.*}} -// NO_HALF: define noundef float @"?test_sqrt_half@@YA$halff@$halff@@Z"( +// NO_HALF-LABEL: define noundef float @_Z14test_sqrt_half // NO_HALF: %{{.*}} = call float 
@llvm.sqrt.f32( // NO_HALF: ret float %{{.*}} half test_sqrt_half(half p0) { return sqrt(p0); } -// NATIVE_HALF: define noundef <2 x half> @ +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z15test_sqrt_half2 // NATIVE_HALF: %{{.*}} = call <2 x half> @llvm.sqrt.v2f16 // NATIVE_HALF: ret <2 x half> %{{.*}} -// NO_HALF: define noundef <2 x float> @ +// NO_HALF-LABEL: define noundef <2 x float> @_Z15test_sqrt_half2 // NO_HALF: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32( // NO_HALF: ret <2 x float> %{{.*}} half2 test_sqrt_half2(half2 p0) { return sqrt(p0); } -// NATIVE_HALF: define noundef <3 x half> @ +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z15test_sqrt_half3 // NATIVE_HALF: %{{.*}} = call <3 x half> @llvm.sqrt.v3f16 // NATIVE_HALF: ret <3 x half> %{{.*}} -// NO_HALF: define noundef <3 x float> @ +// NO_HALF-LABEL: define noundef <3 x float> @_Z15test_sqrt_half3 // NO_HALF: %{{.*}} = call <3 x float> @llvm.sqrt.v3f32( // NO_HALF: ret <3 x float> %{{.*}} half3 test_sqrt_half3(half3 p0) { return sqrt(p0); } -// NATIVE_HALF: define noundef <4 x half> @ +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z15test_sqrt_half4 // NATIVE_HALF: %{{.*}} = call <4 x half> @llvm.sqrt.v4f16 // NATIVE_HALF: ret <4 x half> %{{.*}} -// NO_HALF: define noundef <4 x float> @ +// NO_HALF-LABEL: define noundef <4 x float> @_Z15test_sqrt_half4 // NO_HALF: %{{.*}} = call <4 x float> @llvm.sqrt.v4f32( // NO_HALF: ret <4 x float> %{{.*}} half4 test_sqrt_half4(half4 p0) { return sqrt(p0); } -// CHECK: define noundef float @ +// CHECK-LABEL: define noundef float @_Z15test_sqrt_float // CHECK: %{{.*}} = call float @llvm.sqrt.f32( // CHECK: ret float %{{.*}} float test_sqrt_float(float p0) { return sqrt(p0); } -// CHECK: define noundef <2 x float> @ +// CHECK-LABEL: define noundef <2 x float> @_Z16test_sqrt_float2 // CHECK: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32 // CHECK: ret <2 x float> %{{.*}} float2 test_sqrt_float2(float2 p0) { return sqrt(p0); } -// CHECK: define noundef <3 
x float> @ +// CHECK-LABEL: define noundef <3 x float> @_Z16test_sqrt_float3 // CHECK: %{{.*}} = call <3 x float> @llvm.sqrt.v3f32 // CHECK: ret <3 x float> %{{.*}} float3 test_sqrt_float3(float3 p0) { return sqrt(p0); } -// CHECK: define noundef <4 x float> @ +// CHECK-LABEL: define noundef <4 x float> @_Z16test_sqrt_float4 // CHECK: %{{.*}} = call <4 x float> @llvm.sqrt.v4f32 // CHECK: ret <4 x float> %{{.*}} float4 test_sqrt_float4(float4 p0) { return sqrt(p0); } diff --git a/clang/test/CodeGenHLSL/builtins/trunc.hlsl b/clang/test/CodeGenHLSL/builtins/trunc.hlsl index 40b71f45a9ccb2..3da12c88aa7fec 100644 --- a/clang/test/CodeGenHLSL/builtins/trunc.hlsl +++ b/clang/test/CodeGenHLSL/builtins/trunc.hlsl @@ -1,47 +1,46 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF - -// NATIVE_HALF: define noundef half @"?test_trunc_half +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,NO_HALF + +// NATIVE_HALF-LABEL: define noundef half @_Z15test_trunc_half // NATIVE_HALF: call half @llvm.trunc.f16( -// NO_HALF: define noundef float @"?test_trunc_half +// NO_HALF-LABEL: define noundef float @_Z15test_trunc_half // NO_HALF: call float @llvm.trunc.f32( half test_trunc_half(half p0) { return trunc(p0); } -// NATIVE_HALF: define 
noundef <2 x half> @"?test_trunc_half2 +// NATIVE_HALF-LABEL: define noundef <2 x half> @_Z16test_trunc_half2 // NATIVE_HALF: call <2 x half> @llvm.trunc.v2f16 -// NO_HALF: define noundef <2 x float> @"?test_trunc_half2 +// NO_HALF-LABEL: define noundef <2 x float> @_Z16test_trunc_half2 // NO_HALF: call <2 x float> @llvm.trunc.v2f32( half2 test_trunc_half2(half2 p0) { return trunc(p0); } -// NATIVE_HALF: define noundef <3 x half> @"?test_trunc_half3 +// NATIVE_HALF-LABEL: define noundef <3 x half> @_Z16test_trunc_half3 // NATIVE_HALF: call <3 x half> @llvm.trunc.v3f16 -// NO_HALF: define noundef <3 x float> @"?test_trunc_half3 +// NO_HALF-LABEL: define noundef <3 x float> @_Z16test_trunc_half3 // NO_HALF: call <3 x float> @llvm.trunc.v3f32( half3 test_trunc_half3(half3 p0) { return trunc(p0); } -// NATIVE_HALF: define noundef <4 x half> @"?test_trunc_half4 +// NATIVE_HALF-LABEL: define noundef <4 x half> @_Z16test_trunc_half4 // NATIVE_HALF: call <4 x half> @llvm.trunc.v4f16 -// NO_HALF: define noundef <4 x float> @"?test_trunc_half4 +// NO_HALF-LABEL: define noundef <4 x float> @_Z16test_trunc_half4 // NO_HALF: call <4 x float> @llvm.trunc.v4f32( half4 test_trunc_half4(half4 p0) { return trunc(p0); } -// CHECK: define noundef float @"?test_trunc_float +// CHECK-LABEL: define noundef float @_Z16test_trunc_float // CHECK: call float @llvm.trunc.f32( float test_trunc_float(float p0) { return trunc(p0); } -// CHECK: define noundef <2 x float> @"?test_trunc_float2 +// CHECK-LABEL: define noundef <2 x float> @_Z17test_trunc_float2 // CHECK: call <2 x float> @llvm.trunc.v2f32 float2 test_trunc_float2(float2 p0) { return trunc(p0); } -// CHECK: define noundef <3 x float> @"?test_trunc_float3 +// CHECK-LABEL: define noundef <3 x float> @_Z17test_trunc_float3 // CHECK: call <3 x float> @llvm.trunc.v3f32 float3 test_trunc_float3(float3 p0) { return trunc(p0); } -// CHECK: define noundef <4 x float> @"?test_trunc_float4 +// CHECK-LABEL: define noundef <4 x float> 
@_Z17test_trunc_float4 // CHECK: call <4 x float> @llvm.trunc.v4f32 float4 test_trunc_float4(float4 p0) { return trunc(p0); } diff --git a/clang/test/CodeGenHLSL/export.hlsl b/clang/test/CodeGenHLSL/export.hlsl index 63f9f9066f9277..770618ff2e0703 100644 --- a/clang/test/CodeGenHLSL/export.hlsl +++ b/clang/test/CodeGenHLSL/export.hlsl @@ -1,20 +1,19 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s -// CHECK: define void @"?f1@@YAXXZ"() [[Attr:\#[0-9]+]] +// CHECK: define void @_Z2f1v() [[Attr:\#[0-9]+]] export void f1() { } -// CHECK: define void @"?f2@MyNamespace@@YAXXZ"() [[Attr]] +// CHECK: define void @_ZN11MyNamespace2f2Ev() [[Attr]] namespace MyNamespace { export void f2() { } } export { -// CHECK: define void @"?f3@@YAXXZ"() [[Attr]] -// CHECK: define void @"?f4@@YAXXZ"() [[Attr]] +// CHECK: define void @_Z2f3v() [[Attr]] +// CHECK: define void @_Z2f4v() [[Attr]] void f3() {} void f4() {} } diff --git a/clang/test/CodeGenHLSL/float3.hlsl b/clang/test/CodeGenHLSL/float3.hlsl index 63379349d9bd76..767720b049152d 100644 --- a/clang/test/CodeGenHLSL/float3.hlsl +++ b/clang/test/CodeGenHLSL/float3.hlsl @@ -3,7 +3,7 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // Make sure float3 is not changed into float4. 
-// CHECK:<3 x float> @"?foo@@YAT?$__vector@M$02@__clang@@T12@@Z"(<3 x float> noundef %[[PARAM:[0-9a-zA-Z]+]]) +// CHECK:<3 x float> @_Z3fooDv3_f(<3 x float> noundef %[[PARAM:[0-9a-zA-Z]+]]) // CHECK:%[[A_ADDR:.+]] = alloca <3 x float>, align 16 // CHECK-NEXT:store <3 x float> %[[PARAM]], ptr %[[A_ADDR]], align 16 // CHECK-NEXT:%[[V:[0-9]+]] = load <3 x float>, ptr %[[A_ADDR]], align 16 diff --git a/clang/test/CodeGenHLSL/group_shared.hlsl b/clang/test/CodeGenHLSL/group_shared.hlsl index 48d14b2506fbc7..4b2e2beba4f12b 100644 --- a/clang/test/CodeGenHLSL/group_shared.hlsl +++ b/clang/test/CodeGenHLSL/group_shared.hlsl @@ -4,7 +4,7 @@ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // Make sure groupshared translated into address space 3. -// CHECK:@"?a@@3PAMA" = addrspace(3) global [10 x float] +// CHECK:@a = addrspace(3) global [10 x float] groupshared float a[10]; diff --git a/clang/test/CodeGenHLSL/half.hlsl b/clang/test/CodeGenHLSL/half.hlsl index e83a6fc715b8ae..08df6f31fd12fd 100644 --- a/clang/test/CodeGenHLSL/half.hlsl +++ b/clang/test/CodeGenHLSL/half.hlsl @@ -8,12 +8,12 @@ // Make sure use float when not enable-16bit-types. -// FLOAT:define {{.*}}float @"?foo@@YA$halff@$halff@0@Z"(float{{[^,]+}}, float{{[^,)]+}}) +// FLOAT:define {{.*}}float @_Z3fooDhDh(float{{[^,]+}}, float{{[^,)]+}}) // FLOAT-NOT:half // FLOAT:ret float % // Make sure use half when enable-16bit-types. 
-// HALF:define {{.*}}half @"?foo@@YA$f16@$f16@0@Z"(half{{[^,]+}}, half{{[^,)]+}}) +// HALF:define {{.*}}half @_Z3fooDhDh(half{{[^,]+}}, half{{[^,)]+}}) // HALF-NOT:float // HALF:ret half % half foo(half a, half b) { diff --git a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl index f72fe059cb5763..5efecc1489afca 100644 --- a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl +++ b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl @@ -12,7 +12,7 @@ struct Node { }; // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define noundef i32 @"?Find@@YAIY0GE@UNode@@I@Z"(ptr noundef byval([100 x %struct.Node]) align 4 %SortedTree, i32 noundef %key) [[IntAttr:\#[0-9]+]] +// CHECK: define noundef i32 @_Z4FindA100_4Nodej(ptr noundef byval([100 x %struct.Node]) align 4 %SortedTree, i32 noundef %key) [[IntAttr:\#[0-9]+]] // CHECK: ret i32 // Find and return value corresponding to key in the SortedTree uint Find(Node SortedTree[MAX], uint key) { @@ -31,7 +31,7 @@ uint Find(Node SortedTree[MAX], uint key) { } // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define noundef i1 @"?InitTree@@YA_NY0GE@UNode@@V?$RWBuffer@T?$__vector@I$03@__clang@@@hlsl@@I@Z"(ptr noundef byval([100 x %struct.Node]) align 4 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 16 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]] +// CHECK: define noundef i1 @_Z8InitTreeA100_4NodeN4hlsl8RWBufferIDv4_jEEj(ptr noundef byval([100 x %struct.Node]) align 4 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 16 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]] // CHECK: ret i1 // Initialize tree with given buffer // Imagine the inout works @@ -52,7 +52,7 @@ RWBuffer gTree; // Mangled entry points are internal // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define internal void @"?main@@YAXI@Z"(i32 noundef %GI) [[IntAttr]] +// CHECK: define internal void @_Z4mainj(i32 noundef %GI) [[IntAttr]] // CHECK: ret void // 
Canonical entry points are external and shader attributed @@ -71,7 +71,7 @@ void main(uint GI : SV_GroupIndex) { // Mangled entry points are internal // CHECK: Function Attrs:{{.*}}norecurse -// CHECK: define internal void @"?defaultMain@@YAXXZ"() [[IntAttr]] +// CHECK: define internal void @_Z11defaultMainv() [[IntAttr]] // CHECK: ret void // Canonical entry points are external and shader attributed diff --git a/clang/test/CodeGenHLSL/inline-constructors.hlsl b/clang/test/CodeGenHLSL/inline-constructors.hlsl index 995878a9c0f798..b0d5a783fb3725 100644 --- a/clang/test/CodeGenHLSL/inline-constructors.hlsl +++ b/clang/test/CodeGenHLSL/inline-constructors.hlsl @@ -49,7 +49,7 @@ void NionsDay(int hours) { // Verify constructor is emitted // NOINLINE-NEXT: call void @_GLOBAL__sub_I_inline_constructors.hlsl() // NOINLINE-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() -// NOINLINE-NEXT: call void @"?main@@YAXI@Z"(i32 %0) +// NOINLINE-NEXT: call void @_Z4mainj(i32 %0) // Verify inlining leaves only calls to "llvm." intrinsics // INLINE-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} // CHECK: ret void @@ -64,7 +64,7 @@ void main(unsigned GI : SV_GroupIndex) { // CHECK-NEXT: entry: // Verify constructor is emitted // NOINLINE-NEXT: call void @_GLOBAL__sub_I_inline_constructors.hlsl() -// NOINLINE-NEXT: call void @"?rainyMain@@YAXXZ"() +// NOINLINE-NEXT: call void @_Z9rainyMainv() // Verify inlining leaves only calls to "llvm." 
intrinsics // INLINE-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} // CHECK: ret void diff --git a/clang/test/CodeGenHLSL/inline-functions.hlsl b/clang/test/CodeGenHLSL/inline-functions.hlsl index 7dd905e966e069..fa9c88db26dfc2 100644 --- a/clang/test/CodeGenHLSL/inline-functions.hlsl +++ b/clang/test/CodeGenHLSL/inline-functions.hlsl @@ -15,7 +15,7 @@ float nums[MAX]; // Verify that all functions have the alwaysinline attribute // NOINLINE: Function Attrs: alwaysinline -// NOINLINE: define void @"?swap@@YAXY0GE@III@Z"(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %ix1, i32 noundef %ix2) [[IntAttr:\#[0-9]+]] +// NOINLINE: define void @_Z4swapA100_jjj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %ix1, i32 noundef %ix2) [[IntAttr:\#[0-9]+]] // NOINLINE: ret void // Swap the values of Buf at indices ix1 and ix2 void swap(unsigned Buf[MAX], unsigned ix1, unsigned ix2) { @@ -25,7 +25,7 @@ void swap(unsigned Buf[MAX], unsigned ix1, unsigned ix2) { } // NOINLINE: Function Attrs: alwaysinline -// NOINLINE: define void @"?BubbleSort@@YAXY0GE@II@Z"(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) [[IntAttr]] +// NOINLINE: define void @_Z10BubbleSortA100_jj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) [[IntAttr]] // NOINLINE: ret void // Inefficiently sort Buf in place void BubbleSort(unsigned Buf[MAX], unsigned size) { @@ -43,7 +43,7 @@ void BubbleSort(unsigned Buf[MAX], unsigned size) { // Note ExtAttr is the inlined export set of attribs // CHECK: Function Attrs: alwaysinline -// CHECK: define noundef i32 @"?RemoveDupes@@YAIY0GE@II@Z"(ptr {{[a-z_ ]*}}noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) {{[a-z_ ]*}}[[ExtAttr:\#[0-9]+]] +// CHECK: define noundef i32 @_Z11RemoveDupesA100_jj(ptr {{[a-z_ ]*}}noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) {{[a-z_ ]*}}[[ExtAttr:\#[0-9]+]] // CHECK: ret i32 // Sort Buf and remove any duplicate values // returns the number of values left @@ -67,7 +67,7 @@ 
RWBuffer Indices; // because it has internal linkage from the start // Note main functions get the norecurse attrib, which IntAttr reflects // NOINLINE: Function Attrs: alwaysinline -// NOINLINE: define internal void @"?main@@YAXI@Z"(i32 noundef %GI) [[IntAttr]] +// NOINLINE: define internal void @_Z4mainj(i32 noundef %GI) [[IntAttr]] // NOINLINE: ret void // The unmangled version is not inlined, EntryAttr reflects that @@ -94,7 +94,7 @@ void main(unsigned int GI : SV_GroupIndex) { // because it has internal linkage from the start // Note main functions get the norecurse attrib, which IntAttr reflects // NOINLINE: Function Attrs: alwaysinline -// NOINLINE: define internal void @"?main10@@YAXXZ"() [[IntAttr]] +// NOINLINE: define internal void @_Z6main10v() [[IntAttr]] // NOINLINE: ret void // The unmangled version is not inlined, EntryAttr reflects that diff --git a/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl b/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl index 7e7ebe930bd96e..ea358c411997d0 100644 --- a/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl +++ b/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl @@ -13,7 +13,7 @@ void main(unsigned GI : SV_GroupIndex) { //CHECK: define void @main() #[[#ENTRY_ATTR:]] { //CHECK-NEXT: entry: //CHECK-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() -//CHECK-NEXT: call void @"?main@@YAXI@Z"(i32 %0) +//CHECK-NEXT: call void @_Z4mainj(i32 %0) //CHECK-NEXT: ret void //CHECK-NEXT: } diff --git a/clang/test/CodeGenHLSL/shift-mask.hlsl b/clang/test/CodeGenHLSL/shift-mask.hlsl index d046efaf9c1f9c..7b3890ae560d22 100644 --- a/clang/test/CodeGenHLSL/shift-mask.hlsl +++ b/clang/test/CodeGenHLSL/shift-mask.hlsl @@ -1,12 +1,11 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s int 
shl32(int V, int S) { return V << S; } -// CHECK: define noundef i32 @"?shl32{{[@$?.A-Za-z0-9_]+}}"(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-LABEL: define noundef i32 @_Z5shl32ii(i32 noundef %V, i32 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 // CHECK-DAG: %{{.*}} = shl i32 %{{.*}}, %[[Masked]] @@ -14,7 +13,7 @@ int shr32(int V, int S) { return V >> S; } -// CHECK: define noundef i32 @"?shr32{{[@$?.A-Za-z0-9_]+}}"(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-LABEL: define noundef i32 @_Z5shr32ii(i32 noundef %V, i32 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 // CHECK-DAG: %{{.*}} = ashr i32 %{{.*}}, %[[Masked]] @@ -22,7 +21,7 @@ int64_t shl64(int64_t V, int64_t S) { return V << S; } -// CHECK: define noundef i64 @"?shl64{{[@$?.A-Za-z0-9_]+}}"(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-LABEL: define noundef i64 @_Z5shl64ll(i64 noundef %V, i64 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 // CHECK-DAG: %{{.*}} = shl i64 %{{.*}}, %[[Masked]] @@ -30,6 +29,38 @@ int64_t shr64(int64_t V, int64_t S) { return V >> S; } -// CHECK: define noundef i64 @"?shr64{{[@$?.A-Za-z0-9_]+}}"(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-LABEL: define noundef i64 @_Z5shr64ll(i64 noundef %V, i64 noundef %S) #0 { // CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 // CHECK-DAG: %{{.*}} = ashr i64 %{{.*}}, %[[Masked]] + +uint shlu32(uint V, uint S) { + return V << S; +} + +// CHECK-LABEL: define noundef i32 @_Z6shlu32jj(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 +// CHECK-DAG: %{{.*}} = shl i32 %{{.*}}, %[[Masked]] + +uint shru32(uint V, uint S) { + return V >> S; +} + +// CHECK-LABEL: define noundef i32 @_Z6shru32jj(i32 noundef %V, i32 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i32 %{{.*}}, 31 +// CHECK-DAG: %{{.*}} = lshr i32 %{{.*}}, %[[Masked]] + +uint64_t shlu64(uint64_t V, uint64_t S) { + return V << S; +} + +// CHECK-LABEL: define noundef i64 
@_Z6shlu64mm(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 +// CHECK-DAG: %{{.*}} = shl i64 %{{.*}}, %[[Masked]] + +uint64_t shru64(uint64_t V, uint64_t S) { + return V >> S; +} + +// CHECK-LABEL: define noundef i64 @_Z6shru64mm(i64 noundef %V, i64 noundef %S) #0 { +// CHECK-DAG: %[[Masked:.*]] = and i64 %{{.*}}, 63 +// CHECK-DAG: %{{.*}} = lshr i64 %{{.*}}, %[[Masked]] diff --git a/clang/test/CodeGenHLSL/sret_output.hlsl b/clang/test/CodeGenHLSL/sret_output.hlsl index c44914f963a90f..c324790ba016df 100644 --- a/clang/test/CodeGenHLSL/sret_output.hlsl +++ b/clang/test/CodeGenHLSL/sret_output.hlsl @@ -1,5 +1,4 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // FIXME: add semantic to a. @@ -10,10 +9,10 @@ struct S { // Make sure sret parameter is generated. -// CHECK:define internal void @"?ps_main@@YA?AUS@@XZ"(ptr dead_on_unwind noalias writable sret(%struct.S) align 4 %agg.result) +// CHECK:define internal void @_Z7ps_mainv(ptr dead_on_unwind noalias writable sret(%struct.S) align 4 %agg.result) // FIXME: change it to real value instead of poison value once semantic is add to a. // Make sure the function with sret is called. 
-// CHECK:call void @"?ps_main@@YA?AUS@@XZ"(ptr poison) +// CHECK:call void @_Z7ps_mainv(ptr poison) [shader("pixel")] S ps_main() { S s; diff --git a/clang/test/CodeGenHLSL/static-local-ctor.hlsl b/clang/test/CodeGenHLSL/static-local-ctor.hlsl index f55f6808672dea..eba37e3f4c6b83 100644 --- a/clang/test/CodeGenHLSL/static-local-ctor.hlsl +++ b/clang/test/CodeGenHLSL/static-local-ctor.hlsl @@ -13,16 +13,16 @@ void InitBuf(RWBuffer buf) { } // CHECK-NOT: _Init_thread_epoch -// CHECK: define internal void @"?main@@YAXXZ" +// CHECK: define internal void @_Z4mainv // CHECK-NEXT: entry: // CHECK-NEXT: [[Tmp1:%.*]] = alloca %"class.hlsl::RWBuffer" -// CHECK-NEXT: [[Tmp2:%.*]] = load i32, ptr -// CHECK-NEXT: [[Tmp3:%.*]] = and i32 [[Tmp2]], 1 -// CHECK-NEXT: [[Tmp4:%.*]] = icmp eq i32 [[Tmp3]], 0 -// CHECK-NEXT: br i1 [[Tmp4]] +// CHECK-NEXT: [[Tmp2:%.*]] = load i8, ptr @_ZGVZ4mainvE5mybuf +// CHECK-NEXT: [[Tmp3:%.*]] = icmp eq i8 [[Tmp2]], 0 +// CHECK-NEXT: br i1 [[Tmp3]] // CHECK-NOT: _Init_thread_header -// CHECK: init: -// CHECK-NEXT: = or i32 [[Tmp2]], 1 +// CHECK: init.check: +// CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC1Ev +// CHECK-NEXT: store i8 1, ptr @_ZGVZ4mainvE5mybuf // CHECK-NOT: _Init_thread_footer diff --git a/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl b/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl index eabd0faff6a87e..f85bab2113170b 100644 --- a/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl +++ b/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl @@ -1,15 +1,14 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // CHECK-DAG: @[[CB:.+]] = external constant { float } cbuffer A { float a; - // CHECK-DAG:@b = internal global float 3.000000e+00, align 4 + // CHECK-DAG:@_ZL1b = internal global 
float 3.000000e+00, align 4 static float b = 3; // CHECK:load float, ptr @[[CB]], align 4 - // CHECK:load float, ptr @b, align 4 + // CHECK:load float, ptr @_ZL1b, align 4 float foo() { return a + b; } } diff --git a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl index f0affcb69a3fcd..5a3bdc3d4d38ee 100644 --- a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl +++ b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl @@ -25,7 +25,7 @@ void main() { } // This test makes a probably safe assumption that HLSL 202x includes operator overloading for assignment operators. -// CHECK: define linkonce_odr noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { +// CHECK: define linkonce_odr noundef i32 @_ZN4Pair8getFirstEv(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // CHECK-NEXT:%Another = alloca %struct.Pair, align 4 @@ -37,19 +37,19 @@ void main() { // CHECK-NEXT:%Second = getelementptr inbounds nuw %struct.Pair, ptr %Another, i32 0, i32 1 // CHECK-NEXT:store i32 10, ptr %Second, align 4 // CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %agg.tmp, ptr align 4 %Another, i32 8, i1 false) -// CHECK-NEXT:call void @"??4Pair@@QAAXU0@@Z"(ptr noundef nonnull align 4 dereferenceable(8) %this1, ptr noundef byval(%struct.Pair) align 4 %agg.tmp) +// CHECK-NEXT:call void @_ZN4PairaSES_(ptr noundef nonnull align 4 dereferenceable(8) %this1, ptr noundef byval(%struct.Pair) align 4 %agg.tmp) // CHECK-NEXT:%First2 = getelementptr inbounds nuw %struct.Pair, ptr %this1, i32 0, i32 0 // CHECK-NEXT:%0 = load i32, ptr %First2, align 4 // CHECK-NEXT:ret i32 %0 -// CHECK: define linkonce_odr noundef i32 @"?getSecond@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %this) #0 align 2 { +// CHECK: define linkonce_odr noundef i32 @_ZN4Pair9getSecondEv(ptr noundef nonnull align 
4 dereferenceable(8) %this) #0 align 2 { // CHECK-NEXT:entry: // CHECK-NEXT:%this.addr = alloca ptr, align 4 // CHECK-NEXT:%agg.tmp = alloca %struct.Pair, align 4 // CHECK-NEXT:store ptr %this, ptr %this.addr, align 4 // CHECK-NEXT:%this1 = load ptr, ptr %this.addr, align 4 // CHECK-NEXT:call void @llvm.memset.p0.i32(ptr align 4 %agg.tmp, i8 0, i32 8, i1 false) -// CHECK-NEXT:call void @"??4Pair@@QAAXU0@@Z"(ptr noundef nonnull align 4 dereferenceable(8) %this1, ptr noundef byval(%struct.Pair) align 4 %agg.tmp) +// CHECK-NEXT:call void @_ZN4PairaSES_(ptr noundef nonnull align 4 dereferenceable(8) %this1, ptr noundef byval(%struct.Pair) align 4 %agg.tmp) // CHECK-NEXT:%Second = getelementptr inbounds nuw %struct.Pair, ptr %this1, i32 0, i32 1 // CHECK-NEXT:%0 = load i32, ptr %Second, align 4 // CHECK-NEXT:ret i32 %0 diff --git a/clang/test/CodeGenHLSL/this-assignment.hlsl b/clang/test/CodeGenHLSL/this-assignment.hlsl index 7408d199910e5c..72bd2f8e70af8f 100644 --- a/clang/test/CodeGenHLSL/this-assignment.hlsl +++ b/clang/test/CodeGenHLSL/this-assignment.hlsl @@ -1,5 +1,4 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s | FileCheck %s -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s | FileCheck %s struct Pair { int First; @@ -40,7 +39,7 @@ void main() { // CHECK-NEXT:%Another = alloca %struct.Pair, align 4 // CHECK-NEXT:store ptr %this, ptr %this.addr, align 4 // CHECK-NEXT:%this1 = load ptr, ptr %this.addr, align 4 -// CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %Another, ptr align 4 @"__const.?getFirst@Pair@@QAAHXZ.Another", i32 8, i1 false) +// CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %Another, ptr align 4 @__const._ZN4Pair8getFirstEv.Another, i32 8, i1 false) // CHECK-NEXT:call void @llvm.memcpy.p0.p0.i32(ptr align 4 %this1, ptr align 4 %Another, i32 8, i1 false) // CHECK-NEXT:%First = 
getelementptr inbounds nuw %struct.Pair, ptr %this1, i32 0, i32 0 @@ -56,9 +55,7 @@ void main() { // CHECK-LABEL: define {{.*}}DoSilly // CHECK-NEXT:entry: -// CHECK-NEXT: [[ResPtr:%.*]] = alloca ptr // CHECK-NEXT: [[ThisPtrAddr:%.*]] = alloca ptr -// CHECK-NEXT: store ptr [[AggRes:%.*]], ptr [[ResPtr]] // CHECK-NEXT: store ptr {{.*}}, ptr [[ThisPtrAddr]] // CHECK-NEXT: [[ThisPtr:%.*]] = load ptr, ptr [[ThisPtrAddr]] // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ThisPtr]], ptr align 4 [[Obj:%.*]], i32 8, i1 false) @@ -66,4 +63,4 @@ void main() { // CHECK-NEXT: [[First:%.*]] = load i32, ptr [[FirstAddr]] // CHECK-NEXT: [[FirstPlusTwo:%.*]] = add nsw i32 [[First]], 2 // CHECK-NEXT: store i32 [[FirstPlusTwo]], ptr [[FirstAddr]] -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AggRes]], ptr align 4 [[Obj]], i32 8, i1 false) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 {{.*}}, ptr align 4 [[Obj]], i32 8, i1 false) diff --git a/clang/test/CodeGenHLSL/this-reference.hlsl b/clang/test/CodeGenHLSL/this-reference.hlsl index 032ee34ec65d3b..66b79d42500122 100644 --- a/clang/test/CodeGenHLSL/this-reference.hlsl +++ b/clang/test/CodeGenHLSL/this-reference.hlsl @@ -21,10 +21,10 @@ void main() { } // This tests reference like `this` in HLSL - // CHECK: %call = call noundef i32 @"?getFirst@Pair@@QAAHXZ"(ptr noundef nonnull align 4 dereferenceable(8) %Vals) + // CHECK: %call = call noundef i32 @_ZN4Pair8getFirstEv(ptr noundef nonnull align 4 dereferenceable(8) %Vals) // CHECK-NEXT: %First = getelementptr inbounds nuw %struct.Pair, ptr %Vals, i32 0, i32 0 // CHECK-NEXT: store i32 %call, ptr %First, align 4 - // CHECK-NEXT: %call1 = call noundef float @"?getSecond@Pair@@QAAMXZ"(ptr noundef nonnull align 4 dereferenceable(8) %Vals) + // CHECK-NEXT: %call1 = call noundef float @_ZN4Pair9getSecondEv(ptr noundef nonnull align 4 dereferenceable(8) %Vals) // CHECK-NEXT: %Second = getelementptr inbounds nuw %struct.Pair, ptr %Vals, i32 0, i32 1 
// CHECK: [[Pair:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Pair" From 747d8f3fc93d912183059142631a343fb20bd07f Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 10 Oct 2024 12:01:56 -0700 Subject: [PATCH 063/177] [SandboxVec][DAG] Implement PredIterator (#111604) This patch implements an iterator for iterating over both use-def and mem dependencies of MemDGNodes. --- .../SandboxVectorizer/DependencyGraph.h | 73 +++++++++++++++++++ .../SandboxVectorizer/DependencyGraph.cpp | 44 +++++++++++ .../SandboxVectorizer/DependencyGraphTest.cpp | 41 +++++++++++ 3 files changed, 158 insertions(+) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index 134adc4b21ab12..eba6d7562e41de 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -40,6 +40,54 @@ enum class DGNodeID { MemDGNode, }; +class DGNode; +class MemDGNode; +class DependencyGraph; + +/// While OpIt points to a Value that is not an Instruction keep incrementing +/// it. \Returns the first iterator that points to an Instruction, or end. +[[nodiscard]] static User::op_iterator skipNonInstr(User::op_iterator OpIt, + User::op_iterator OpItE) { + while (OpIt != OpItE && !isa((*OpIt).get())) + ++OpIt; + return OpIt; +} + +/// Iterate over both def-use and mem dependencies. 
+class PredIterator { + User::op_iterator OpIt; + User::op_iterator OpItE; + DenseSet::iterator MemIt; + DGNode *N = nullptr; + DependencyGraph *DAG = nullptr; + + PredIterator(const User::op_iterator &OpIt, const User::op_iterator &OpItE, + const DenseSet::iterator &MemIt, DGNode *N, + DependencyGraph &DAG) + : OpIt(OpIt), OpItE(OpItE), MemIt(MemIt), N(N), DAG(&DAG) {} + PredIterator(const User::op_iterator &OpIt, const User::op_iterator &OpItE, + DGNode *N, DependencyGraph &DAG) + : OpIt(OpIt), OpItE(OpItE), N(N), DAG(&DAG) {} + friend class DGNode; // For constructor + friend class MemDGNode; // For constructor + +public: + using difference_type = std::ptrdiff_t; + using value_type = DGNode *; + using pointer = value_type *; + using reference = value_type &; + using iterator_category = std::input_iterator_tag; + value_type operator*(); + PredIterator &operator++(); + PredIterator operator++(int) { + auto Copy = *this; + ++(*this); + return Copy; + } + bool operator==(const PredIterator &Other) const; + bool operator!=(const PredIterator &Other) const { return !(*this == Other); } +}; + /// A DependencyGraph Node that points to an Instruction and contains memory /// dependency edges. class DGNode { @@ -63,6 +111,23 @@ class DGNode { virtual ~DGNode() = default; /// \Returns true if this is before \p Other in program order. 
bool comesBefore(const DGNode *Other) { return I->comesBefore(Other->I); } + using iterator = PredIterator; + virtual iterator preds_begin(DependencyGraph &DAG) { + return PredIterator(skipNonInstr(I->op_begin(), I->op_end()), I->op_end(), + this, DAG); + } + virtual iterator preds_end(DependencyGraph &DAG) { + return PredIterator(I->op_end(), I->op_end(), this, DAG); + } + iterator preds_begin(DependencyGraph &DAG) const { + return const_cast(this)->preds_begin(DAG); + } + iterator preds_end(DependencyGraph &DAG) const { + return const_cast(this)->preds_end(DAG); + } + iterator_range preds(DependencyGraph &DAG) const { + return make_range(preds_begin(DAG), preds_end(DAG)); + } static bool isStackSaveOrRestoreIntrinsic(Instruction *I) { if (auto *II = dyn_cast(I)) { @@ -145,6 +210,14 @@ class MemDGNode final : public DGNode { static bool classof(const DGNode *Other) { return Other->SubclassID == DGNodeID::MemDGNode; } + iterator preds_begin(DependencyGraph &DAG) override { + auto OpEndIt = I->op_end(); + return PredIterator(skipNonInstr(I->op_begin(), OpEndIt), OpEndIt, + MemPreds.begin(), this, DAG); + } + iterator preds_end(DependencyGraph &DAG) override { + return PredIterator(I->op_end(), I->op_end(), MemPreds.end(), this, DAG); + } /// \Returns the previous Mem DGNode in instruction order. MemDGNode *getPrevNode() const { return PrevMemN; } /// \Returns the next Mem DGNode in instruction order. 
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 82f253d4c63231..7aea466ed6d8db 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -8,10 +8,54 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/SandboxIR/Instruction.h" #include "llvm/SandboxIR/Utils.h" namespace llvm::sandboxir { +PredIterator::value_type PredIterator::operator*() { + // If it's a DGNode then we dereference the operand iterator. + if (!isa(N)) { + assert(OpIt != OpItE && "Can't dereference end iterator!"); + return DAG->getNode(cast((Value *)*OpIt)); + } + // It's a MemDGNode, so we check if we return either the use-def operand, + // or a mem predecessor. + if (OpIt != OpItE) + return DAG->getNode(cast((Value *)*OpIt)); + assert(MemIt != cast(N)->memPreds().end() && + "Cant' dereference end iterator!"); + return *MemIt; +} + +PredIterator &PredIterator::operator++() { + // If it's a DGNode then we increment the use-def iterator. + if (!isa(N)) { + assert(OpIt != OpItE && "Already at end!"); + ++OpIt; + // Skip operands that are not instructions. + OpIt = skipNonInstr(OpIt, OpItE); + return *this; + } + // It's a MemDGNode, so if we are not at the end of the use-def iterator we + // need to first increment that. + if (OpIt != OpItE) { + ++OpIt; + // Skip operands that are not instructions. 
+ OpIt = skipNonInstr(OpIt, OpItE); + return *this; + } + assert(MemIt != cast(N)->memPreds().end() && "Already at end!"); + ++MemIt; + return *this; +} + +bool PredIterator::operator==(const PredIterator &Other) const { + assert(DAG == Other.DAG && "Iterators of different DAGs!"); + assert(N == Other.N && "Iterators of different nodes!"); + return OpIt == Other.OpIt && MemIt == Other.MemIt; +} + #ifndef NDEBUG void DGNode::print(raw_ostream &OS, bool PrintDeps) const { I->dumpOS(OS); diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp index e2f16919a5cddd..6b3d9cc77c9955 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp @@ -240,12 +240,53 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) { EXPECT_TRUE(N1->hasMemPred(N0)); EXPECT_FALSE(N0->hasMemPred(N1)); + // Check preds(). + EXPECT_TRUE(N0->preds(DAG).empty()); + EXPECT_THAT(N1->preds(DAG), testing::ElementsAre(N0)); + // Check memPreds(). 
EXPECT_TRUE(N0->memPreds().empty()); EXPECT_THAT(N1->memPreds(), testing::ElementsAre(N0)); EXPECT_TRUE(N2->memPreds().empty()); } +TEST_F(DependencyGraphTest, Preds) { + parseIR(C, R"IR( +declare ptr @bar(i8) +define i8 @foo(i8 %v0, i8 %v1) { + %add0 = add i8 %v0, %v0 + %add1 = add i8 %v1, %v1 + %add2 = add i8 %add0, %add1 + %ptr = call ptr @bar(i8 %add1) + store i8 %add2, ptr %ptr + ret i8 %add2 +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + sandboxir::DependencyGraph DAG(getAA(*LLVMF)); + DAG.extend({&*BB->begin(), BB->getTerminator()}); + + auto *AddN0 = DAG.getNode(cast(&*It++)); + auto *AddN1 = DAG.getNode(cast(&*It++)); + auto *AddN2 = DAG.getNode(cast(&*It++)); + auto *CallN = DAG.getNode(cast(&*It++)); + auto *StN = DAG.getNode(cast(&*It++)); + auto *RetN = DAG.getNode(cast(&*It++)); + + // Check preds(). + EXPECT_THAT(AddN0->preds(DAG), testing::ElementsAre()); + EXPECT_THAT(AddN1->preds(DAG), testing::ElementsAre()); + EXPECT_THAT(AddN2->preds(DAG), testing::ElementsAre(AddN0, AddN1)); + EXPECT_THAT(CallN->preds(DAG), testing::ElementsAre(AddN1)); + EXPECT_THAT(StN->preds(DAG), + testing::UnorderedElementsAre(CallN, CallN, AddN2)); + EXPECT_THAT(RetN->preds(DAG), testing::ElementsAre(AddN2)); +} + TEST_F(DependencyGraphTest, MemDGNode_getPrevNode_getNextNode) { parseIR(C, R"IR( define void @foo(ptr %ptr, i8 %v0, i8 %v1) { From bb937e276da11c6d85318b32006f6510877c1a2c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 10 Oct 2024 20:04:46 +0100 Subject: [PATCH 064/177] [LV] Compute value of escaped induction based on the computed end value. (#110576) Update fixupIVUsers to compute the value for escaped inductions using the already computed end value of the induction (EndValue), but subtracting the step. This results in slightly simpler codegen, as we avoid computing the full transformed index at VectorTripCount - 1. 
PR: https://github.com/llvm/llvm-project/pull/110576 --- .../Transforms/Vectorize/LoopVectorize.cpp | 21 ++++++++---- .../AArch64/sve-live-out-pointer-induction.ll | 4 +-- .../LoopVectorize/X86/float-induction-x86.ll | 14 ++++---- .../LoopVectorize/iv_outside_user.ll | 8 ++--- ...o-fold-tail-by-masking-iv-external-uses.ll | 3 +- .../LoopVectorize/pr58811-scev-expansion.ll | 34 +++++++++---------- 6 files changed, 42 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index db650b23e271e2..f2bee2c67a2353 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2747,17 +2747,24 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, if (isa_and_nonnull(II.getInductionBinOp())) B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); - Value *CountMinusOne = B.CreateSub( - VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); - CountMinusOne->setName("cmo"); - VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep()); assert(StepVPV && "step must have been expanded during VPlan execution"); Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue() : State.get(StepVPV, VPLane(0)); - Value *Escape = - emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, - II.getKind(), II.getInductionBinOp()); + Value *Escape = nullptr; + if (EndValue->getType()->isIntegerTy()) + Escape = B.CreateSub(EndValue, Step); + else if (EndValue->getType()->isPointerTy()) + Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step)); + else if (EndValue->getType()->isFloatingPointTy()) { + Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() == + Instruction::FAdd + ? 
Instruction::FSub + : Instruction::FAdd, + EndValue, Step); + } else { + llvm_unreachable("all possible induction types must be handled"); + } Escape->setName("ind.escape"); MissingVals[UI] = Escape; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll index c28776e82776b7..64b69be5f52598 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll @@ -42,9 +42,7 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) { ; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] -; CHECK-NEXT: [[CMO:%.*]] = sub i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[CMO]], 8 -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[TMP37]] +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[IND_END]], i64 -8 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START_1]], [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index cb4f5f6d9eabaf..54dd9c870a1709 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -208,25 +208,23 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x double> 
[[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <4 x double> [[VEC_IND]], +; AUTO_VEC-NEXT: [[STEP_ADD_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], +; AUTO_VEC-NEXT: [[STEP_ADD_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[INDEX]] ; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 32 ; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 64 ; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP1]], i64 96 ; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], ptr [[TMP1]], align 8 ; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], ptr [[TMP2]], align 8 -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2]], ptr [[TMP3]], align 8 -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3]], ptr [[TMP4]], align 8 +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_2]], ptr [[TMP3]], align 8 +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_3]], ptr [[TMP4]], align 8 ; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] -; AUTO_VEC-NEXT: [[CMO:%.*]] = add nsw i64 [[N_VEC]], -1 -; AUTO_VEC-NEXT: [[DOTCAST6:%.*]] = sitofp i64 [[CMO]] to double -; AUTO_VEC-NEXT: [[TMP6:%.*]] = fmul fast double [[DOTCAST6]], 3.000000e+00 +; AUTO_VEC-NEXT: [[IND_ESCAPE:%.*]] = fadd fast double [[TMP0]], -3.000000e+00 ; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AUTO_VEC: for.body: ; AUTO_VEC-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] @@ -238,7 +236,7 @@ define double 
@external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[SMAX]] ; AUTO_VEC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; AUTO_VEC: for.end: -; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ] +; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ] ; AUTO_VEC-NEXT: ret double [[J_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index bf27c146ec9ce1..02fdbc05ed5188 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -63,7 +63,7 @@ for.end: ; CHECK-LABEL: @geppre ; CHECK-LABEL: middle.block: -; CHECK: %ind.escape = getelementptr i8, ptr %ptr, i64 496 +; CHECK: %ind.escape = getelementptr i8, ptr %ind.end, i64 -16 ; CHECK-LABEL: for.end: ; CHECK: %[[RET:.*]] = phi ptr [ {{.*}}, %for.body ], [ %ind.escape, %middle.block ] ; CHECK: ret ptr %[[RET]] @@ -85,9 +85,7 @@ for.end: ; CHECK-LABEL: @both ; CHECK-LABEL: middle.block: -; CHECK: %[[END:.*]] = sub i64 %n.vec, 1 -; CHECK: %[[END_OFFSET:.*]] = mul i64 %[[END]], 4 -; CHECK: %ind.escape = getelementptr i8, ptr %base, i64 %[[END_OFFSET]] +; CHECK: %ind.escape = getelementptr i8, ptr %ind.end1, i64 -4 ; CHECK-LABEL: for.end: ; CHECK: %[[RET:.*]] = phi ptr [ %inc.lag1, %for.body ], [ %ind.escape, %middle.block ] ; CHECK: ret ptr %[[RET]] @@ -142,7 +140,7 @@ for.end: ; CHECK: %[[N_VEC:.+]] = sub i32 %[[T5]], %[[N_MOD_VF]] ; CHECK: middle.block ; CHECK: %[[CMP:.+]] = icmp eq i32 %[[T5]], %[[N_VEC]] -; CHECK: %ind.escape = add i32 %[[T15]], +; CHECK: %ind.escape = sub i32 %ind.end8, -8 ; CHECK: br i1 %[[CMP]], label %BB3, label %scalar.ph define void @PR30742() { BB0: diff --git 
a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll index 80a6bb50ca91b6..d462d3aa650d28 100644 --- a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll +++ b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll @@ -51,8 +51,7 @@ define i32 @test(ptr %arr, i64 %n) { ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[CMO:%.*]] = sub i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = add i64 1, [[CMO]] +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i64 [[IND_END]], 1 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOAD_VAL:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll index c0eb4ccdd6d7e5..af1c146c2c6c4c 100644 --- a/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/pr58811-scev-expansion.ll @@ -28,10 +28,10 @@ define void @test1_pr58811() { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = mul i32 195, 
[[INDUCTION_IV_LCSSA]] +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i32 [[IND_END]], [[INDUCTION_IV_LCSSA]] ; CHECK-NEXT: br i1 false, label [[LOOP_3_PREHEADER:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 196, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_2_PREHEADER]] ] @@ -123,28 +123,28 @@ define void @test2_pr58811() { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = mul i32 195, [[INDUCTION_IV_LCSSA]] +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i32 [[IND_END]], [[INDUCTION_IV_LCSSA]] ; CHECK-NEXT: br i1 false, label [[LOOP_4_PREHEADER:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 196, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_3_PREHEADER]] ] ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_3_PREHEADER]] ] ; CHECK-NEXT: br label [[LOOP_3:%.*]] ; CHECK: loop.3: -; CHECK-NEXT: [[INT16_TINDARRAYSAFEVAR_186_0747_1:%.*]] = phi i16 [ [[INC_1:%.*]], [[LOOP_3]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[UINT32_TVAR_177_2745_1:%.*]] = phi i32 [ [[SUB93_1:%.*]], [[LOOP_3]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SUB93_1]] = sub i32 [[UINT32_TVAR_177_2745_1]], [[IV_2_LCSSA]] -; CHECK-NEXT: [[INC_1]] = add i16 [[INT16_TINDARRAYSAFEVAR_186_0747_1]], 1 -; CHECK-NEXT: [[CMP88_1:%.*]] = icmp ult i16 [[INT16_TINDARRAYSAFEVAR_186_0747_1]], 198 +; CHECK-NEXT: [[IV_4:%.*]] = phi i16 [ [[INC_1:%.*]], [[LOOP_3]] ], [ 
[[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[IV_5:%.*]] = phi i32 [ [[SUB93_1:%.*]], [[LOOP_3]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUB93_1]] = sub i32 [[IV_5]], [[IV_2_LCSSA]] +; CHECK-NEXT: [[INC_1]] = add i16 [[IV_4]], 1 +; CHECK-NEXT: [[CMP88_1:%.*]] = icmp ult i16 [[IV_4]], 198 ; CHECK-NEXT: br i1 [[CMP88_1]], label [[LOOP_3]], label [[LOOP_4_PREHEADER]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loop.4.preheader: -; CHECK-NEXT: [[UINT32_TVAR_177_2745_1_LCSSA:%.*]] = phi i32 [ [[UINT32_TVAR_177_2745_1]], [[LOOP_3]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[IV_5_LCSSA:%.*]] = phi i32 [ [[IV_5]], [[LOOP_3]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP_4]] ; CHECK: loop.4: -; CHECK-NEXT: [[UINT32_TVAR_177_2745_2:%.*]] = phi i32 [ [[SUB93_2]], [[LOOP_4]] ], [ 0, [[LOOP_4_PREHEADER]] ] -; CHECK-NEXT: [[SUB93_2]] = sub i32 [[UINT32_TVAR_177_2745_2]], [[UINT32_TVAR_177_2745_1_LCSSA]] +; CHECK-NEXT: [[IV_6:%.*]] = phi i32 [ [[SUB93_2]], [[LOOP_4]] ], [ 0, [[LOOP_4_PREHEADER]] ] +; CHECK-NEXT: [[SUB93_2]] = sub i32 [[IV_6]], [[IV_5_LCSSA]] ; CHECK-NEXT: br i1 false, label [[LOOP_4]], label [[LOOP_1_HEADER_LOOPEXIT]] ; entry: @@ -201,10 +201,10 @@ define void @test3_pr58811() { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 196 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[IND_ESCAPE:%.*]] = mul i32 195, [[TMP3]] +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i32 [[IND_END]], [[TMP3]] ; CHECK-NEXT: br i1 false, label [[LOOP_4_PREHEADER:%.*]], label 
[[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 196, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_3_PREHEADER]] ] From 125262312f366bd776b668b24026dbbc8e6b4c75 Mon Sep 17 00:00:00 2001 From: Tyler Nowicki Date: Thu, 10 Oct 2024 15:11:27 -0400 Subject: [PATCH 065/177] [Coroutines] Improve use of unique_ptr (#111870) * Replace usage of unique_ptr<>(new ...) -> make_unique<>(); --- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 12 ++++-------- .../Transforms/Coroutines/ExtraRematTest.cpp | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 88ce331c8cfb64..0395ee62ae988b 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -2211,17 +2211,13 @@ CreateNewABI(Function &F, coro::Shape &S, switch (S.ABI) { case coro::ABI::Switch: - return std::unique_ptr( - new coro::SwitchABI(F, S, IsMatCallback)); + return std::make_unique(F, S, IsMatCallback); case coro::ABI::Async: - return std::unique_ptr( - new coro::AsyncABI(F, S, IsMatCallback)); + return std::make_unique(F, S, IsMatCallback); case coro::ABI::Retcon: - return std::unique_ptr( - new coro::AnyRetconABI(F, S, IsMatCallback)); + return std::make_unique(F, S, IsMatCallback); case coro::ABI::RetconOnce: - return std::unique_ptr( - new coro::AnyRetconABI(F, S, IsMatCallback)); + return std::make_unique(F, S, IsMatCallback); } llvm_unreachable("Unknown ABI"); } diff --git a/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp b/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp index c3394fdaa940ba..68bf640334b5f2 100644 --- a/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp +++ b/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp @@ -247,7 +247,7 @@ TEST_F(ExtraRematTest, TestCoroRematWithCustomABI) { ASSERT_TRUE(M); CoroSplitPass::BaseABITy GenCustomABI = [](Function &F, coro::Shape &S) { - return 
std::unique_ptr(new ExtraCustomABI(F, S)); + return std::make_unique(F, S); }; CGSCCPassManager CGPM; From f6e93b8147a94a595293b47c39d20d2038c812d1 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 10 Oct 2024 12:13:36 -0700 Subject: [PATCH 066/177] AMDGPU: Minor improvement and cleanup for waterfall loop generation (#111886) First, ReadlanePieces should be in the scope of each MachineOperand. It is not correct if we declare in a outer scope without clearing after the use for a MachineOperand. Additionally, we do not need the OrigBB argyment for emitLoadScalarOpsFromVGPRLoop, since MachineFunction (the only use) can be obtained from LoopBB (or BodyBB). --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 0c2ae382f53a19..d676d561d08180 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6302,11 +6302,14 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, // Emit the actual waterfall loop, executing the wrapped instruction for each // unique value of \p ScalarOps across all lanes. In the best case we execute 1 // iteration, in the worst case we execute 64 (once per lane). -static void emitLoadScalarOpsFromVGPRLoop( - const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, - MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, - ArrayRef ScalarOps) { - MachineFunction &MF = *OrigBB.getParent(); +static void +emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, + MachineRegisterInfo &MRI, + MachineBasicBlock &LoopBB, + MachineBasicBlock &BodyBB, + const DebugLoc &DL, + ArrayRef ScalarOps) { + MachineFunction &MF = *LoopBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; @@ -6319,8 +6322,6 @@ static void emitLoadScalarOpsFromVGPRLoop( const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); MachineBasicBlock::iterator I = LoopBB.begin(); - - SmallVector ReadlanePieces; Register CondReg; for (MachineOperand *ScalarOp : ScalarOps) { @@ -6355,6 +6356,7 @@ static void emitLoadScalarOpsFromVGPRLoop( ScalarOp->setReg(CurReg); ScalarOp->setIsKill(); } else { + SmallVector ReadlanePieces; unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef()); assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size"); @@ -6535,7 +6537,7 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, } } - emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps); + emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps); MachineBasicBlock::iterator First = RemainderBB->begin(); // Restore SCC From e34d614e7d8616f165f3f5d349db98d9924826f2 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 10 Oct 2024 15:28:46 -0400 Subject: [PATCH 067/177] [Passes] Remove -enable-infer-alignment-pass flag (#111873) This flag has been on for a while without any complaints. 
--- llvm/lib/Passes/PassBuilderPipelines.cpp | 8 ++----- .../InstCombineLoadStoreAlloca.cpp | 23 ------------------- 2 files changed, 2 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 8f151a99b11709..0167d1058c3ac1 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -300,8 +300,6 @@ extern cl::opt UseCtxProfile; namespace llvm { extern cl::opt EnableMemProfContextDisambiguation; - -extern cl::opt EnableInferAlignmentPass; } // namespace llvm PipelineTuningOptions::PipelineTuningOptions() { @@ -1250,8 +1248,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, FPM.addPass(LoopVectorizePass( LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); - if (EnableInferAlignmentPass) - FPM.addPass(InferAlignmentPass()); + FPM.addPass(InferAlignmentPass()); if (IsFullLTO) { // The vectorizer may have significantly shortened a loop body; unroll // again. 
Unroll small loops to hide loop backedge latency and saturate any @@ -1369,8 +1366,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, FPM.addPass(SROAPass(SROAOptions::PreserveCFG)); } - if (EnableInferAlignmentPass) - FPM.addPass(InferAlignmentPass()); + FPM.addPass(InferAlignmentPass()); FPM.addPass(InstCombinePass()); // This is needed for two reasons: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 0b51845ab5e257..93d183837d6f43 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -37,13 +37,6 @@ static cl::opt MaxCopiedFromConstantUsers( cl::desc("Maximum users to visit in copy from constant transform"), cl::Hidden); -namespace llvm { -cl::opt EnableInferAlignmentPass( - "enable-infer-alignment-pass", cl::init(true), cl::Hidden, cl::ZeroOrMore, - cl::desc("Enable the InferAlignment pass, disabling alignment inference in " - "InstCombine")); -} - /// isOnlyCopiedFromConstantMemory - Recursively walk the uses of a (derived) /// pointer to an alloca. Ignore any reads of the pointer, return false if we /// see any stores or other unknown uses. If we see pointer arithmetic, keep @@ -1010,14 +1003,6 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) { if (Instruction *Res = combineLoadToOperationType(*this, LI)) return Res; - if (!EnableInferAlignmentPass) { - // Attempt to improve the alignment. - Align KnownAlign = getOrEnforceKnownAlignment( - Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT); - if (KnownAlign > LI.getAlign()) - LI.setAlignment(KnownAlign); - } - // Replace GEP indices if possible. 
if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) return replaceOperand(LI, 0, NewGEPI); @@ -1358,14 +1343,6 @@ Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) { if (combineStoreToValueType(*this, SI)) return eraseInstFromFunction(SI); - if (!EnableInferAlignmentPass) { - // Attempt to improve the alignment. - const Align KnownAlign = getOrEnforceKnownAlignment( - Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT); - if (KnownAlign > SI.getAlign()) - SI.setAlignment(KnownAlign); - } - // Try to canonicalize the stored type. if (unpackStoreToAggregate(*this, SI)) return eraseInstFromFunction(SI); From 07892aaf04032e7a18368bc8320f93f7d46ab20f Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 9 Oct 2024 11:18:01 -0700 Subject: [PATCH 068/177] [NFC][sanitizer] Clang format sanitizer_thread_registry.cpp --- .../sanitizer_thread_registry.cpp | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.cpp index 741e0731c41559..df04822b28851c 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.cpp @@ -18,9 +18,16 @@ namespace __sanitizer { ThreadContextBase::ThreadContextBase(u32 tid) - : tid(tid), unique_id(0), reuse_count(), os_id(0), user_id(0), - status(ThreadStatusInvalid), detached(false), - thread_type(ThreadType::Regular), parent_tid(0), next(0) { + : tid(tid), + unique_id(0), + reuse_count(), + os_id(0), + user_id(0), + status(ThreadStatusInvalid), + detached(false), + thread_type(ThreadType::Regular), + parent_tid(0), + next(0) { name[0] = '\0'; atomic_store(&thread_destroyed, 0, memory_order_release); } @@ -39,8 +46,7 @@ void ThreadContextBase::SetName(const char *new_name) { } void ThreadContextBase::SetDead() { - CHECK(status == ThreadStatusRunning || - status == 
ThreadStatusFinished); + CHECK(status == ThreadStatusRunning || status == ThreadStatusFinished); status = ThreadStatusDead; user_id = 0; OnDead(); @@ -68,7 +74,8 @@ void ThreadContextBase::SetFinished() { // for a thread that never actually started. In that case the thread // should go to ThreadStatusFinished regardless of whether it was created // as detached. - if (!detached || status == ThreadStatusCreated) status = ThreadStatusFinished; + if (!detached || status == ThreadStatusCreated) + status = ThreadStatusFinished; OnFinished(); } @@ -124,8 +131,10 @@ void ThreadRegistry::GetNumberOfThreads(uptr *total, uptr *running, ThreadRegistryLock l(this); if (total) *total = threads_.size(); - if (running) *running = running_threads_; - if (alive) *alive = alive_threads_; + if (running) + *running = running_threads_; + if (alive) + *alive = alive_threads_; } uptr ThreadRegistry::GetMaxAliveThreads() { @@ -150,8 +159,10 @@ u32 ThreadRegistry::CreateThread(uptr user_id, bool detached, u32 parent_tid, Report("%s: Thread limit (%u threads) exceeded. Dying.\n", SanitizerToolName, max_threads_); #else - Printf("race: limit on %u simultaneously alive goroutines is exceeded," - " dying\n", max_threads_); + Printf( + "race: limit on %u simultaneously alive goroutines is exceeded," + " dying\n", + max_threads_); #endif Die(); } @@ -170,8 +181,7 @@ u32 ThreadRegistry::CreateThread(uptr user_id, bool detached, u32 parent_tid, // positives later (e.g. if we join a wrong thread). 
CHECK(live_.try_emplace(user_id, tid).second); } - tctx->SetCreated(user_id, total_threads_++, detached, - parent_tid, arg); + tctx->SetCreated(user_id, total_threads_++, detached, parent_tid, arg); return tid; } @@ -196,8 +206,8 @@ u32 ThreadRegistry::FindThread(FindThreadCallback cb, void *arg) { return kInvalidTid; } -ThreadContextBase * -ThreadRegistry::FindThreadContextLocked(FindThreadCallback cb, void *arg) { +ThreadContextBase *ThreadRegistry::FindThreadContextLocked( + FindThreadCallback cb, void *arg) { CheckLocked(); for (u32 tid = 0; tid < threads_.size(); tid++) { ThreadContextBase *tctx = threads_[tid]; @@ -210,7 +220,7 @@ ThreadRegistry::FindThreadContextLocked(FindThreadCallback cb, void *arg) { static bool FindThreadContextByOsIdCallback(ThreadContextBase *tctx, void *arg) { return (tctx->os_id == (uptr)arg && tctx->status != ThreadStatusInvalid && - tctx->status != ThreadStatusDead); + tctx->status != ThreadStatusDead); } ThreadContextBase *ThreadRegistry::FindThreadContextByOsIDLocked(tid_t os_id) { From a4916d200518ac077be93995af18bd80fcb89cc2 Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 10 Oct 2024 12:42:28 -0700 Subject: [PATCH 069/177] [SandboxVec][DAG] Refactoring: Move MemPreds from DGNode to MemDGNode (#111897) --- .../SandboxVectorizer/DependencyGraph.h | 36 +++++---- .../SandboxVectorizer/DependencyGraph.cpp | 20 ++--- .../SandboxVectorizer/DependencyGraphTest.cpp | 73 +++++++++++-------- 3 files changed, 72 insertions(+), 57 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index eba6d7562e41de..da50e5326ea069 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -1,4 +1,4 @@ -//===- DependencyGraph.h ----------------------------------*- C++ -*-===// +//===- DependencyGraph.h 
----------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -96,9 +96,6 @@ class DGNode { // TODO: Use a PointerIntPair for SubclassID and I. /// For isa/dyn_cast etc. DGNodeID SubclassID; - // TODO: Move MemPreds to MemDGNode. - /// Memory predecessors. - DenseSet MemPreds; DGNode(Instruction *I, DGNodeID ID) : I(I), SubclassID(ID) {} friend class MemDGNode; // For constructor. @@ -170,17 +167,6 @@ class DGNode { } Instruction *getInstruction() const { return I; } - void addMemPred(MemDGNode *PredN) { MemPreds.insert(PredN); } - /// \Returns all memory dependency predecessors. - iterator_range::const_iterator> memPreds() const { - return make_range(MemPreds.begin(), MemPreds.end()); - } - /// \Returns true if there is a memory dependency N->this. - bool hasMemPred(DGNode *N) const { - if (auto *MN = dyn_cast(N)) - return MemPreds.count(MN); - return false; - } #ifndef NDEBUG virtual void print(raw_ostream &OS, bool PrintDeps = true) const; @@ -198,6 +184,9 @@ class DGNode { class MemDGNode final : public DGNode { MemDGNode *PrevMemN = nullptr; MemDGNode *NextMemN = nullptr; + /// Memory predecessors. + DenseSet MemPreds; + friend class PredIterator; // For MemPreds. void setNextNode(MemDGNode *N) { NextMemN = N; } void setPrevNode(MemDGNode *N) { PrevMemN = N; } @@ -222,6 +211,21 @@ class MemDGNode final : public DGNode { MemDGNode *getPrevNode() const { return PrevMemN; } /// \Returns the next Mem DGNode in instruction order. MemDGNode *getNextNode() const { return NextMemN; } + /// Adds the mem dependency edge PredN->this. + void addMemPred(MemDGNode *PredN) { MemPreds.insert(PredN); } + /// \Returns true if there is a memory dependency N->this. + bool hasMemPred(DGNode *N) const { + if (auto *MN = dyn_cast(N)) + return MemPreds.count(MN); + return false; + } + /// \Returns all memory dependency predecessors. 
Used by tests. + iterator_range::const_iterator> memPreds() const { + return make_range(MemPreds.begin(), MemPreds.end()); + } +#ifndef NDEBUG + virtual void print(raw_ostream &OS, bool PrintDeps = true) const override; +#endif // NDEBUG }; /// Convenience builders for a MemDGNode interval. @@ -266,7 +270,7 @@ class DependencyGraph { /// Go through all mem nodes in \p SrcScanRange and try to add dependencies to /// \p DstN. - void scanAndAddDeps(DGNode &DstN, const Interval &SrcScanRange); + void scanAndAddDeps(MemDGNode &DstN, const Interval &SrcScanRange); public: DependencyGraph(AAResults &AA) diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 7aea466ed6d8db..70843812ff65bc 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -23,7 +23,8 @@ PredIterator::value_type PredIterator::operator*() { // or a mem predecessor. if (OpIt != OpItE) return DAG->getNode(cast((Value *)*OpIt)); - assert(MemIt != cast(N)->memPreds().end() && + // It's a MemDGNode with OpIt == end, so we need to use MemIt. + assert(MemIt != cast(N)->MemPreds.end() && "Cant' dereference end iterator!"); return *MemIt; } @@ -45,7 +46,8 @@ PredIterator &PredIterator::operator++() { OpIt = skipNonInstr(OpIt, OpItE); return *this; } - assert(MemIt != cast(N)->memPreds().end() && "Already at end!"); + // It's a MemDGNode with OpIt == end, so we need to increment MemIt. 
+ assert(MemIt != cast(N)->MemPreds.end() && "Already at end!"); ++MemIt; return *this; } @@ -57,10 +59,14 @@ bool PredIterator::operator==(const PredIterator &Other) const { } #ifndef NDEBUG -void DGNode::print(raw_ostream &OS, bool PrintDeps) const { +void DGNode::print(raw_ostream &OS, bool PrintDeps) const { I->dumpOS(OS); } +void DGNode::dump() const { + print(dbgs()); + dbgs() << "\n"; +} +void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const { I->dumpOS(OS); if (PrintDeps) { - OS << "\n"; // Print memory preds. static constexpr const unsigned Indent = 4; for (auto *Pred : MemPreds) { @@ -70,10 +76,6 @@ void DGNode::print(raw_ostream &OS, bool PrintDeps) const { } } } -void DGNode::dump() const { - print(dbgs()); - dbgs() << "\n"; -} #endif // NDEBUG Interval @@ -179,7 +181,7 @@ bool DependencyGraph::hasDep(Instruction *SrcI, Instruction *DstI) { llvm_unreachable("Unknown DependencyType enum"); } -void DependencyGraph::scanAndAddDeps(DGNode &DstN, +void DependencyGraph::scanAndAddDeps(MemDGNode &DstN, const Interval &SrcScanRange) { assert(isa(DstN) && "DstN is the mem dep destination, so it must be mem"); diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp index 6b3d9cc77c9955..5a9c9815ca42fa 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp @@ -50,10 +50,10 @@ struct DependencyGraphTest : public testing::Test { return *AA; } /// \Returns true if there is a dependency: SrcN->DstN. 
- bool dependency(sandboxir::DGNode *SrcN, sandboxir::DGNode *DstN) { - const auto &Preds = DstN->memPreds(); - auto It = find(Preds, SrcN); - return It != Preds.end(); + bool memDependency(sandboxir::DGNode *SrcN, sandboxir::DGNode *DstN) { + if (auto *MemDstN = dyn_cast(DstN)) + return MemDstN->hasMemPred(SrcN); + return false; } }; @@ -230,9 +230,10 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) { EXPECT_EQ(Span.top(), &*BB->begin()); EXPECT_EQ(Span.bottom(), BB->getTerminator()); - sandboxir::DGNode *N0 = DAG.getNode(S0); - sandboxir::DGNode *N1 = DAG.getNode(S1); - sandboxir::DGNode *N2 = DAG.getNode(Ret); + auto *N0 = cast(DAG.getNode(S0)); + auto *N1 = cast(DAG.getNode(S1)); + auto *N2 = DAG.getNode(Ret); + // Check getInstruction(). EXPECT_EQ(N0->getInstruction(), S0); EXPECT_EQ(N1->getInstruction(), S1); @@ -247,7 +248,7 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) { // Check memPreds(). EXPECT_TRUE(N0->memPreds().empty()); EXPECT_THAT(N1->memPreds(), testing::ElementsAre(N0)); - EXPECT_TRUE(N2->memPreds().empty()); + EXPECT_TRUE(N2->preds(DAG).empty()); } TEST_F(DependencyGraphTest, Preds) { @@ -399,12 +400,14 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) { sandboxir::DependencyGraph DAG(getAA(*LLVMF)); DAG.extend({&*BB->begin(), BB->getTerminator()}); auto It = BB->begin(); - auto *Store0N = DAG.getNode(cast(&*It++)); - auto *Store1N = DAG.getNode(cast(&*It++)); + auto *Store0N = cast( + DAG.getNode(cast(&*It++))); + auto *Store1N = cast( + DAG.getNode(cast(&*It++))); auto *RetN = DAG.getNode(cast(&*It++)); EXPECT_TRUE(Store0N->memPreds().empty()); EXPECT_THAT(Store1N->memPreds(), testing::ElementsAre(Store0N)); - EXPECT_TRUE(RetN->memPreds().empty()); + EXPECT_TRUE(RetN->preds(DAG).empty()); } TEST_F(DependencyGraphTest, NonAliasingStores) { @@ -422,13 +425,15 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1, i8 %v0, i8 %v1) { sandboxir::DependencyGraph DAG(getAA(*LLVMF)); DAG.extend({&*BB->begin(), BB->getTerminator()}); auto It = BB->begin(); 
- auto *Store0N = DAG.getNode(cast(&*It++)); - auto *Store1N = DAG.getNode(cast(&*It++)); + auto *Store0N = cast( + DAG.getNode(cast(&*It++))); + auto *Store1N = cast( + DAG.getNode(cast(&*It++))); auto *RetN = DAG.getNode(cast(&*It++)); // We expect no dependencies because the stores don't alias. EXPECT_TRUE(Store0N->memPreds().empty()); EXPECT_TRUE(Store1N->memPreds().empty()); - EXPECT_TRUE(RetN->memPreds().empty()); + EXPECT_TRUE(RetN->preds(DAG).empty()); } TEST_F(DependencyGraphTest, VolatileLoads) { @@ -446,12 +451,14 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1) { sandboxir::DependencyGraph DAG(getAA(*LLVMF)); DAG.extend({&*BB->begin(), BB->getTerminator()}); auto It = BB->begin(); - auto *Ld0N = DAG.getNode(cast(&*It++)); - auto *Ld1N = DAG.getNode(cast(&*It++)); + auto *Ld0N = cast( + DAG.getNode(cast(&*It++))); + auto *Ld1N = cast( + DAG.getNode(cast(&*It++))); auto *RetN = DAG.getNode(cast(&*It++)); EXPECT_TRUE(Ld0N->memPreds().empty()); EXPECT_THAT(Ld1N->memPreds(), testing::ElementsAre(Ld0N)); - EXPECT_TRUE(RetN->memPreds().empty()); + EXPECT_TRUE(RetN->preds(DAG).empty()); } TEST_F(DependencyGraphTest, VolatileSotres) { @@ -469,12 +476,14 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1, i8 %v) { sandboxir::DependencyGraph DAG(getAA(*LLVMF)); DAG.extend({&*BB->begin(), BB->getTerminator()}); auto It = BB->begin(); - auto *Store0N = DAG.getNode(cast(&*It++)); - auto *Store1N = DAG.getNode(cast(&*It++)); + auto *Store0N = cast( + DAG.getNode(cast(&*It++))); + auto *Store1N = cast( + DAG.getNode(cast(&*It++))); auto *RetN = DAG.getNode(cast(&*It++)); EXPECT_TRUE(Store0N->memPreds().empty()); EXPECT_THAT(Store1N->memPreds(), testing::ElementsAre(Store0N)); - EXPECT_TRUE(RetN->memPreds().empty()); + EXPECT_TRUE(RetN->preds(DAG).empty()); } TEST_F(DependencyGraphTest, Call) { @@ -498,12 +507,12 @@ define void @foo(float %v1, float %v2) { DAG.extend({&*BB->begin(), BB->getTerminator()->getPrevNode()}); auto It = BB->begin(); - auto 
*Call1N = DAG.getNode(&*It++); + auto *Call1N = cast(DAG.getNode(&*It++)); auto *AddN = DAG.getNode(&*It++); - auto *Call2N = DAG.getNode(&*It++); + auto *Call2N = cast(DAG.getNode(&*It++)); EXPECT_THAT(Call1N->memPreds(), testing::ElementsAre()); - EXPECT_THAT(AddN->memPreds(), testing::ElementsAre()); + EXPECT_THAT(AddN->preds(DAG), testing::ElementsAre()); EXPECT_THAT(Call2N->memPreds(), testing::ElementsAre(Call1N)); } @@ -534,8 +543,8 @@ define void @foo() { auto *AllocaN = DAG.getNode(&*It++); auto *StackRestoreN = DAG.getNode(&*It++); - EXPECT_TRUE(dependency(AllocaN, StackRestoreN)); - EXPECT_TRUE(dependency(StackSaveN, AllocaN)); + EXPECT_TRUE(memDependency(AllocaN, StackRestoreN)); + EXPECT_TRUE(memDependency(StackSaveN, AllocaN)); } // Checks that stacksave and stackrestore depend on other mem instrs. @@ -567,9 +576,9 @@ define void @foo(i8 %v0, i8 %v1, ptr %ptr) { auto *StackRestoreN = DAG.getNode(&*It++); auto *Store1N = DAG.getNode(&*It++); - EXPECT_TRUE(dependency(Store0N, StackSaveN)); - EXPECT_TRUE(dependency(StackSaveN, StackRestoreN)); - EXPECT_TRUE(dependency(StackRestoreN, Store1N)); + EXPECT_TRUE(memDependency(Store0N, StackSaveN)); + EXPECT_TRUE(memDependency(StackSaveN, StackRestoreN)); + EXPECT_TRUE(memDependency(StackRestoreN, Store1N)); } // Make sure there is a dependency between a stackrestore and an alloca. 
@@ -596,7 +605,7 @@ define void @foo(ptr %ptr) { auto *StackRestoreN = DAG.getNode(&*It++); auto *AllocaN = DAG.getNode(&*It++); - EXPECT_TRUE(dependency(StackRestoreN, AllocaN)); + EXPECT_TRUE(memDependency(StackRestoreN, AllocaN)); } // Make sure there is a dependency between the alloca and stacksave @@ -623,7 +632,7 @@ define void @foo(ptr %ptr) { auto *AllocaN = DAG.getNode(&*It++); auto *StackSaveN = DAG.getNode(&*It++); - EXPECT_TRUE(dependency(AllocaN, StackSaveN)); + EXPECT_TRUE(memDependency(AllocaN, StackSaveN)); } // A non-InAlloca in a stacksave-stackrestore region does not need extra @@ -655,6 +664,6 @@ define void @foo() { auto *AllocaN = DAG.getNode(&*It++); auto *StackRestoreN = DAG.getNode(&*It++); - EXPECT_FALSE(dependency(StackSaveN, AllocaN)); - EXPECT_FALSE(dependency(AllocaN, StackRestoreN)); + EXPECT_FALSE(memDependency(StackSaveN, AllocaN)); + EXPECT_FALSE(memDependency(AllocaN, StackRestoreN)); } From d832a1c744fddad93ec4d8d2739c2a49a3623e02 Mon Sep 17 00:00:00 2001 From: Justin Fargnoli Date: Thu, 10 Oct 2024 12:57:43 -0700 Subject: [PATCH 070/177] [NVPTX] Only run LowerUnreachable when necessary (#109868) Before CUDA 12.3 `ptxas` did not recognize that the trap instruction terminates a basic block. Instead, it would assume that control flow continued to the next instruction. The next instruction could be in the block that's lexically below it. This would lead to phantom CFG edges being created within ptxas. [NVPTX: Lower unreachable to exit to allow ptxas to accurately reconstruct the CFG.](https://github.com/llvm/llvm-project/commit/1ee4d880e8760256c606fe55b7af85a4f70d006d) added the LowerUnreachable pass to NVPTX to work around this. Several other WAR patches followed. 
This bug in `ptxas` was fixed in CUDA 12.3 and is thus impossible to encounter when targeting PTX ISA v8.3+ This commit reverts the WARs for the `ptxas` bug when targeting PTX ISA v8.3+ CC @maleadt --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 5 +- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 8 ++ llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 10 +- llvm/test/CodeGen/NVPTX/unreachable.ll | 103 +++++++++++++++---- 4 files changed, 100 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 8f4eddb5142740..8b34ce4f1001c1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -139,6 +139,8 @@ def hasVote : Predicate<"Subtarget->hasVote()">; def hasDouble : Predicate<"Subtarget->hasDouble()">; def hasLDG : Predicate<"Subtarget->hasLDG()">; def hasLDU : Predicate<"Subtarget->hasLDU()">; +def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">; +def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">; def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; @@ -3736,9 +3738,10 @@ def Callseq_End : [(callseq_end timm:$amt1, timm:$amt2)]>; // trap instruction +def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>, Requires<[noPTXASUnreachableBug]>; // Emit an `exit` as well to convey to ptxas that `trap` exits the CFG. // This won't be necessary in a future version of ptxas. 
-def trapinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>; +def trapexitinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>, Requires<[hasPTXASUnreachableBug]>; // brkpt instruction def debugtrapinst : NVPTXInst<(outs), (ins), "brkpt;", [(debugtrap)]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 8b9059bd60cbd4..e785bbf830da62 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -95,6 +95,14 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasDotInstructions() const { return SmVersion >= 61 && PTXVersion >= 50; } + // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction + // terminates a basic block. Instead, it would assume that control flow + // continued to the next instruction. The next instruction could be in the + // block that's lexically below it. This would lead to a phantom CFG edges + // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when + // PTX ISA versions 8.3+ we can confidently say that the bug will not be + // present. + bool hasPTXASUnreachableBug() const { return PTXVersion < 83; } bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; } unsigned int getFullSmVersion() const { return FullSmVersion; } unsigned int getSmVersion() const { return getFullSmVersion() / 10; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 8e6e4395efb559..2eb8b17f1b0f40 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -367,9 +367,13 @@ void NVPTXPassConfig::addIRPasses() { addPass(createSROAPass()); } - const auto &Options = getNVPTXTargetMachine().Options; - addPass(createNVPTXLowerUnreachablePass(Options.TrapUnreachable, - Options.NoTrapAfterNoreturn)); + if (ST.hasPTXASUnreachableBug()) { + // Run LowerUnreachable to WAR a ptxas bug. 
See the commit description of + // 1ee4d880e8760256c606fe55b7af85a4f70d006d for more details. + const auto &Options = getNVPTXTargetMachine().Options; + addPass(createNVPTXLowerUnreachablePass(Options.TrapUnreachable, + Options.NoTrapAfterNoreturn)); + } } bool NVPTXPassConfig::addInstSelector() { diff --git a/llvm/test/CodeGen/NVPTX/unreachable.ll b/llvm/test/CodeGen/NVPTX/unreachable.ll index f9118900cb7372..6bd583c8d50d8a 100644 --- a/llvm/test/CodeGen/NVPTX/unreachable.ll +++ b/llvm/test/CodeGen/NVPTX/unreachable.ll @@ -1,48 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs -trap-unreachable=false \ -; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOTRAP +; RUN: | FileCheck %s --check-prefixes=CHECK,NO-TRAP-UNREACHABLE ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs -trap-unreachable=false \ -; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOTRAP +; RUN: | FileCheck %s --check-prefixes=CHECK,NO-TRAP-UNREACHABLE ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs -trap-unreachable -no-trap-after-noreturn \ -; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOTRAP +; RUN: | FileCheck %s --check-prefixes=CHECK,NO-TRAP-AFTER-NORETURN ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs -trap-unreachable -no-trap-after-noreturn \ -; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOTRAP +; RUN: | FileCheck %s --check-prefixes=CHECK,NO-TRAP-AFTER-NORETURN ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs -trap-unreachable -no-trap-after-noreturn=false \ -; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-TRAP +; RUN: | FileCheck %s --check-prefixes=CHECK,TRAP ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs -trap-unreachable -no-trap-after-noreturn=false \ -; RUN: | FileCheck %s --check-prefix=CHECK 
--check-prefix=CHECK-TRAP +; RUN: | FileCheck %s --check-prefixes=CHECK,TRAP +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs -trap-unreachable -mattr=+ptx83 \ +; RUN: | FileCheck %s --check-prefixes=BUG-FIXED ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} -; CHECK: .extern .func throw +target triple = "nvptx-unknown-cuda" + declare void @throw() #0 declare void @llvm.trap() #0 -; CHECK-LABEL: .entry kernel_func define void @kernel_func() { -; CHECK: call.uni -; CHECK: throw, +; NO-TRAP-UNREACHABLE-LABEL: kernel_func( +; NO-TRAP-UNREACHABLE: { +; NO-TRAP-UNREACHABLE-EMPTY: +; NO-TRAP-UNREACHABLE-EMPTY: +; NO-TRAP-UNREACHABLE-NEXT: // %bb.0: +; NO-TRAP-UNREACHABLE-NEXT: { // callseq 0, 0 +; NO-TRAP-UNREACHABLE-NEXT: call.uni +; NO-TRAP-UNREACHABLE-NEXT: throw, +; NO-TRAP-UNREACHABLE-NEXT: ( +; NO-TRAP-UNREACHABLE-NEXT: ); +; NO-TRAP-UNREACHABLE-NEXT: } // callseq 0 +; NO-TRAP-UNREACHABLE-NEXT: // begin inline asm +; NO-TRAP-UNREACHABLE-NEXT: exit; +; NO-TRAP-UNREACHABLE-NEXT: // end inline asm +; +; NO-TRAP-AFTER-NORETURN-LABEL: kernel_func( +; NO-TRAP-AFTER-NORETURN: { +; NO-TRAP-AFTER-NORETURN-EMPTY: +; NO-TRAP-AFTER-NORETURN-EMPTY: +; NO-TRAP-AFTER-NORETURN-NEXT: // %bb.0: +; NO-TRAP-AFTER-NORETURN-NEXT: { // callseq 0, 0 +; NO-TRAP-AFTER-NORETURN-NEXT: call.uni +; NO-TRAP-AFTER-NORETURN-NEXT: throw, +; NO-TRAP-AFTER-NORETURN-NEXT: ( +; NO-TRAP-AFTER-NORETURN-NEXT: ); +; NO-TRAP-AFTER-NORETURN-NEXT: } // callseq 0 +; NO-TRAP-AFTER-NORETURN-NEXT: // begin inline asm +; NO-TRAP-AFTER-NORETURN-NEXT: exit; +; NO-TRAP-AFTER-NORETURN-NEXT: // end inline asm +; NO-TRAP-AFTER-NORETURN-NEXT: trap; exit; +; +; TRAP-LABEL: kernel_func( +; TRAP: { +; TRAP-EMPTY: +; TRAP-EMPTY: +; TRAP-NEXT: // %bb.0: +; TRAP-NEXT: { // callseq 0, 0 +; TRAP-NEXT: call.uni +; TRAP-NEXT: throw, +; TRAP-NEXT: ( +; 
TRAP-NEXT: ); +; TRAP-NEXT: } // callseq 0 +; TRAP-NEXT: trap; exit; +; +; BUG-FIXED-LABEL: kernel_func( +; BUG-FIXED: { +; BUG-FIXED-EMPTY: +; BUG-FIXED-EMPTY: +; BUG-FIXED-NEXT: // %bb.0: +; BUG-FIXED-NEXT: { // callseq 0, 0 +; BUG-FIXED-NEXT: call.uni +; BUG-FIXED-NEXT: throw, +; BUG-FIXED-NEXT: ( +; BUG-FIXED-NEXT: ); +; BUG-FIXED-NEXT: } // callseq 0 +; BUG-FIXED-NEXT: trap; call void @throw() -; CHECK-TRAP-NOT: exit; -; CHECK-TRAP: trap; -; CHECK-NOTRAP-NOT: trap; -; CHECK: exit; unreachable } -; CHECK-LABEL: kernel_func_2 define void @kernel_func_2() { -; CHECK: trap; exit; +; CHECK-LABEL: kernel_func_2( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: trap; exit; +; +; BUG-FIXED-LABEL: kernel_func_2( +; BUG-FIXED: { +; BUG-FIXED-EMPTY: +; BUG-FIXED-EMPTY: +; BUG-FIXED-NEXT: // %bb.0: +; BUG-FIXED-NEXT: trap; call void @llvm.trap() - -;; Make sure we avoid emitting two trap instructions. -; CHECK-NOT: trap; -; CHECK-NOT: exit; +; Make sure we avoid emitting two trap instructions. unreachable } attributes #0 = { noreturn } - !nvvm.annotations = !{!1} - !1 = !{ptr @kernel_func, !"kernel", i32 1} From 29e192a0bfbc75fa66498d3b1c1d1329009f1dd2 Mon Sep 17 00:00:00 2001 From: Tyler Nowicki Date: Thu, 10 Oct 2024 15:59:24 -0400 Subject: [PATCH 071/177] [Coroutines] Documentation for custom ABIs (#111781) Update the llvm/docs/Coroutines.rst docs to include a full description of Custom ABI objects. This documentation describes the how ABI objects allow users (plugin libraries) to create custom ABI objects for their needs. --- llvm/docs/Coroutines.rst | 90 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/llvm/docs/Coroutines.rst b/llvm/docs/Coroutines.rst index 5679aefcb421d8..8794df65504fa2 100644 --- a/llvm/docs/Coroutines.rst +++ b/llvm/docs/Coroutines.rst @@ -312,6 +312,7 @@ lowered to a constant representing the size required for the coroutine frame. 
The `coro.begin`_ intrinsic initializes the coroutine frame and returns the coroutine handle. The second parameter of `coro.begin` is given a block of memory to be used if the coroutine frame needs to be allocated dynamically. + The `coro.id`_ intrinsic serves as coroutine identity useful in cases when the `coro.begin`_ intrinsic get duplicated by optimization passes such as jump-threading. @@ -749,6 +750,65 @@ and python iterator `__next__` would look like: return *(int*)coro.promise(hdl, 4, false); } +Custom ABIs and Plugin Libraries +-------------------------------- + +Plugin libraries can extend coroutine lowering enabling a wide variety of users +to utilize the coroutine transformation passes. An existing coroutine lowering +is extended by: + +#. defining custom ABIs that inherit from the existing ABIs, +#. give a list of generators for the custom ABIs when constructing the `CoroSplit`_ pass, and +#. use `coro.begin.custom.abi`_ in place of `coro.begin`_ that has an additional parameter for the index of the generator/ABI to be used for the coroutine. + +A custom ABI overriding the SwitchABI's materialization looks like: + +.. code-block:: c++ + + class CustomSwitchABI : public coro::SwitchABI { + public: + CustomSwitchABI(Function &F, coro::Shape &S) + : coro::SwitchABI(F, S, ExtraMaterializable) {} + }; + +Giving a list of custom ABI generators while constructing the `CoroSplit` +pass looks like: + +.. code-block:: c++ + + CoroSplitPass::BaseABITy GenCustomABI = [](Function &F, coro::Shape &S) { + return std::make_unique(F, S); + }; + + CGSCCPassManager CGPM; + CGPM.addPass(CoroSplitPass({GenCustomABI})); + +The LLVM IR for a coroutine using a Coroutine with a custom ABI looks like: + +.. 
code-block:: llvm + + define ptr @f(i32 %n) presplitcoroutine_custom_abi { + entry: + %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) + %size = call i32 @llvm.coro.size.i32() + %alloc = call ptr @malloc(i32 %size) + %hdl = call noalias ptr @llvm.coro.begin.custom.abi(token %id, ptr %alloc, i32 0) + br label %loop + loop: + %n.val = phi i32 [ %n, %entry ], [ %inc, %loop ] + %inc = add nsw i32 %n.val, 1 + call void @print(i32 %n.val) + %0 = call i8 @llvm.coro.suspend(token none, i1 false) + switch i8 %0, label %suspend [i8 0, label %loop + i8 1, label %cleanup] + cleanup: + %mem = call ptr @llvm.coro.free(token %id, ptr %hdl) + call void @free(ptr %mem) + br label %suspend + suspend: + %unused = call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + ret ptr %hdl + } Intrinsics ========== @@ -1007,6 +1067,36 @@ with small positive and negative offsets). A frontend should emit exactly one `coro.begin` intrinsic per coroutine. +.. _coro.begin.custom.abi: + +'llvm.coro.begin.custom.abi' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:: + + declare ptr @llvm.coro.begin.custom.abi(token , ptr , i32) + +Overview: +""""""""" + +The '``llvm.coro.begin.custom.abi``' intrinsic is used in place of the +`coro.begin` intrinsic that has an additional parameter to specify the custom +ABI for the coroutine. The return is identical to that of the `coro.begin` +intrinsic. + +Arguments: +"""""""""" + +The first and second arguments are identical to those of the `coro.begin` +intrinsic. + +The third argument is an i32 index of the generator list given to the +`CoroSplit` pass specifying the custom ABI generator lor this coroutine. + +Semantics: +"""""""""" + +The semantics are identical to those of the `coro.begin` intrinsic. + .. 
_coro.free: 'llvm.coro.free' Intrinsic From d5e1de6da96c1ab3b8cae68447e8ed3696a7006e Mon Sep 17 00:00:00 2001 From: Robert O'Callahan Date: Fri, 11 Oct 2024 09:01:47 +1300 Subject: [PATCH 072/177] [lldb] Implement basic support for reverse-continue (#99736) This commit only adds support for the `SBProcess::ReverseContinue()` API. A user-accessible command for this will follow in a later commit. This feature depends on a gdbserver implementation (e.g. `rr`) providing support for the `bc` and `bs` packets. `lldb-server` does not support those packets, and there is no plan to change that. So, for testing purposes, `lldbreverse.py` wraps `lldb-server` with a Python implementation of *very limited* record-and-replay functionality for use by *tests only*. The majority of this PR is test infrastructure (about 700 of the 950 lines added). --- lldb/include/lldb/API/SBProcess.h | 1 + lldb/include/lldb/Target/Process.h | 21 +- lldb/include/lldb/Target/StopInfo.h | 6 + lldb/include/lldb/lldb-enumerations.h | 6 + .../Python/lldbsuite/test/gdbclientutils.py | 5 +- .../Python/lldbsuite/test/lldbgdbproxy.py | 175 ++++++++ .../Python/lldbsuite/test/lldbreverse.py | 418 ++++++++++++++++++ .../Python/lldbsuite/test/lldbtest.py | 2 + lldb/source/API/SBProcess.cpp | 8 +- lldb/source/API/SBThread.cpp | 2 + .../source/Interpreter/CommandInterpreter.cpp | 3 +- .../Process/Linux/NativeThreadLinux.cpp | 3 + .../Process/MacOSX-Kernel/ProcessKDP.cpp | 9 +- .../Process/MacOSX-Kernel/ProcessKDP.h | 2 +- .../Process/Windows/Common/ProcessWindows.cpp | 8 +- .../Process/Windows/Common/ProcessWindows.h | 2 +- .../GDBRemoteCommunicationClient.cpp | 22 + .../gdb-remote/GDBRemoteCommunicationClient.h | 6 + .../GDBRemoteCommunicationServerLLGS.cpp | 1 + .../Process/gdb-remote/ProcessGDBRemote.cpp | 77 +++- .../Process/gdb-remote/ProcessGDBRemote.h | 2 +- .../Process/scripted/ScriptedProcess.cpp | 9 +- .../Process/scripted/ScriptedProcess.h | 2 +- lldb/source/Target/Process.cpp | 29 +- 
lldb/source/Target/StopInfo.cpp | 29 ++ lldb/source/Target/Thread.cpp | 8 +- .../reverse-execution/Makefile | 3 + .../TestReverseContinueBreakpoints.py | 115 +++++ .../TestReverseContinueNotSupported.py | 30 ++ .../functionalities/reverse-execution/main.c | 14 + lldb/tools/lldb-dap/JSONUtils.cpp | 3 + lldb/tools/lldb-dap/LLDBUtils.cpp | 1 + 32 files changed, 978 insertions(+), 44 deletions(-) create mode 100644 lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py create mode 100644 lldb/packages/Python/lldbsuite/test/lldbreverse.py create mode 100644 lldb/test/API/functionalities/reverse-execution/Makefile create mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py create mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py create mode 100644 lldb/test/API/functionalities/reverse-execution/main.c diff --git a/lldb/include/lldb/API/SBProcess.h b/lldb/include/lldb/API/SBProcess.h index 1624e02070b1b2..8b8ed830b54cc0 100644 --- a/lldb/include/lldb/API/SBProcess.h +++ b/lldb/include/lldb/API/SBProcess.h @@ -159,6 +159,7 @@ class LLDB_API SBProcess { lldb::SBError Destroy(); lldb::SBError Continue(); + lldb::SBError Continue(RunDirection direction); lldb::SBError Stop(); diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index b8c53a474ba6b9..fe7fbc50fd5770 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -857,10 +857,10 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - Status Resume(); + Status Resume(lldb::RunDirection direction = lldb::eRunForward); /// Resume a process, and wait for it to stop. - Status ResumeSynchronous(Stream *stream); + Status ResumeSynchronous(Stream *stream, lldb::RunDirection direction = lldb::eRunForward); /// Halts a running process. 
/// @@ -1104,9 +1104,14 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - virtual Status DoResume() { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support resuming processes", GetPluginName()); + virtual Status DoResume(lldb::RunDirection direction) { + if (direction == lldb::RunDirection::eRunForward) { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support resuming processes", GetPluginName()); + } else { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + } } /// Called after resuming a process. @@ -2332,6 +2337,8 @@ class Process : public std::enable_shared_from_this, bool IsRunning() const; + lldb::RunDirection GetLastRunDirection() { return m_last_run_direction; } + DynamicCheckerFunctions *GetDynamicCheckers() { return m_dynamic_checkers_up.get(); } @@ -2851,7 +2858,7 @@ void PruneThreadPlans(); /// /// \return /// An Status object describing the success or failure of the resume. - Status PrivateResume(); + Status PrivateResume(lldb::RunDirection direction = lldb::eRunForward); // Called internally void CompleteAttach(); @@ -3127,6 +3134,8 @@ void PruneThreadPlans(); // m_currently_handling_do_on_removals are true, // Resume will only request a resume, using this // flag to check. + // The direction of execution from the last time this process was resumed. + lldb::RunDirection m_last_run_direction; lldb::tid_t m_interrupt_tid; /// The tid of the thread that issued the async /// interrupt, used by thread plan timeout. 
It diff --git a/lldb/include/lldb/Target/StopInfo.h b/lldb/include/lldb/Target/StopInfo.h index fae90364deaf0a..072f71f6b1122f 100644 --- a/lldb/include/lldb/Target/StopInfo.h +++ b/lldb/include/lldb/Target/StopInfo.h @@ -142,6 +142,12 @@ class StopInfo : public std::enable_shared_from_this { static lldb::StopInfoSP CreateStopReasonProcessorTrace(Thread &thread, const char *description); + // This creates a StopInfo indicating that execution stopped because + // it was replaying some recorded execution history, and execution reached + // the end of that recorded history. + static lldb::StopInfoSP + CreateStopReasonHistoryBoundary(Thread &thread, const char *description); + static lldb::StopInfoSP CreateStopReasonFork(Thread &thread, lldb::pid_t child_pid, lldb::tid_t child_tid); diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 938f6e3abe8f2a..232d1dfdb5c9d0 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -135,6 +135,9 @@ FLAGS_ENUM(LaunchFlags){ /// Thread Run Modes. enum RunMode { eOnlyThisThread, eAllThreads, eOnlyDuringStepping }; +/// Execution directions +enum RunDirection { eRunForward, eRunReverse }; + /// Byte ordering definitions. enum ByteOrder { eByteOrderInvalid = 0, @@ -254,6 +257,9 @@ enum StopReason { eStopReasonVFork, eStopReasonVForkDone, eStopReasonInterrupt, ///< Thread requested interrupt + // Indicates that execution stopped because the debugger backend relies + // on recorded data and we reached the end of that data. + eStopReasonHistoryBoundary, }; /// Command Return Status Types. 
diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py index 1784487323ad6b..732d6171320680 100644 --- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py +++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py @@ -510,8 +510,9 @@ def start(self): self._thread.start() def stop(self): - self._thread.join() - self._thread = None + if self._thread is not None: + self._thread.join() + self._thread = None def get_connect_address(self): return self._socket.get_connect_address() diff --git a/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py b/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py new file mode 100644 index 00000000000000..2a9592bf4545a4 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py @@ -0,0 +1,175 @@ +import logging +import os +import os.path +import random + +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.gdbclientutils import * +import lldbgdbserverutils +from lldbsuite.support import seven + + +class GDBProxyTestBase(TestBase): + """ + Base class for gdbserver proxy tests. + + This class will setup and start a mock GDB server for the test to use. + It pases through requests to a regular lldb-server/debugserver and + forwards replies back to the LLDB under test. 
+ """ + + """The gdbserver that we implement.""" + server = None + """The inner lldb-server/debugserver process that we proxy requests into.""" + monitor_server = None + monitor_sock = None + + server_socket_class = TCPServerSocket + + DEFAULT_TIMEOUT = 20 * (10 if ("ASAN_OPTIONS" in os.environ) else 1) + + _verbose_log_handler = None + _log_formatter = logging.Formatter(fmt="%(asctime)-15s %(levelname)-8s %(message)s") + + def setUpBaseLogging(self): + self.logger = logging.getLogger(__name__) + + if len(self.logger.handlers) > 0: + return # We have set up this handler already + + self.logger.propagate = False + self.logger.setLevel(logging.DEBUG) + + # log all warnings to stderr + handler = logging.StreamHandler() + handler.setLevel(logging.WARNING) + handler.setFormatter(self._log_formatter) + self.logger.addHandler(handler) + + def setUp(self): + TestBase.setUp(self) + + self.setUpBaseLogging() + + if self.isVerboseLoggingRequested(): + # If requested, full logs go to a log file + log_file_name = self.getLogBasenameForCurrentTest() + "-proxy.log" + self._verbose_log_handler = logging.FileHandler( + log_file_name + ) + self._verbose_log_handler.setFormatter(self._log_formatter) + self._verbose_log_handler.setLevel(logging.DEBUG) + self.logger.addHandler(self._verbose_log_handler) + + lldb_server_exe = lldbgdbserverutils.get_lldb_server_exe() + if lldb_server_exe is None: + self.debug_monitor_exe = lldbgdbserverutils.get_debugserver_exe() + self.assertTrue(self.debug_monitor_exe is not None) + self.debug_monitor_extra_args = [] + else: + self.debug_monitor_exe = lldb_server_exe + self.debug_monitor_extra_args = ["gdbserver"] + + self.server = MockGDBServer(self.server_socket_class()) + self.server.responder = self + + def tearDown(self): + # TestBase.tearDown will kill the process, but we need to kill it early + # so its client connection closes and we can stop the server before + # finally calling the base tearDown. 
+ if self.process() is not None: + self.process().Kill() + self.server.stop() + + self.logger.removeHandler(self._verbose_log_handler) + self._verbose_log_handler = None + + TestBase.tearDown(self) + + def isVerboseLoggingRequested(self): + # We will report our detailed logs if the user requested that the "gdb-remote" channel is + # logged. + return any(("gdb-remote" in channel) for channel in lldbtest_config.channels) + + def connect(self, target): + """ + Create a process by connecting to the mock GDB server. + """ + self.prep_debug_monitor_and_inferior() + self.server.start() + + listener = self.dbg.GetListener() + error = lldb.SBError() + process = target.ConnectRemote( + listener, self.server.get_connect_url(), "gdb-remote", error + ) + self.assertTrue(error.Success(), error.description) + self.assertTrue(process, PROCESS_IS_VALID) + return process + + def get_next_port(self): + return 12000 + random.randint(0, 3999) + + def prep_debug_monitor_and_inferior(self): + inferior_exe_path = self.getBuildArtifact("a.out") + self.connect_to_debug_monitor([inferior_exe_path]) + self.assertIsNotNone(self.monitor_server) + self.initial_handshake() + + def initial_handshake(self): + self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "+") + self.monitor_server.send_packet(seven.bitcast_to_bytes("QStartNoAckMode")) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "+") + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "OK") + self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "+") + + def get_debug_monitor_command_line_args(self, connect_address, launch_args): + return self.debug_monitor_extra_args + ["--reverse-connect", connect_address] + launch_args + + 
def launch_debug_monitor(self, launch_args): + family, type, proto, _, addr = socket.getaddrinfo( + "localhost", 0, proto=socket.IPPROTO_TCP + )[0] + sock = socket.socket(family, type, proto) + sock.settimeout(self.DEFAULT_TIMEOUT) + sock.bind(addr) + sock.listen(1) + addr = sock.getsockname() + connect_address = "[{}]:{}".format(*addr) + + commandline_args = self.get_debug_monitor_command_line_args( + connect_address, launch_args + ) + + # Start the server. + self.logger.info(f"Spawning monitor {commandline_args}") + monitor_process = self.spawnSubprocess( + self.debug_monitor_exe, commandline_args, install_remote=False + ) + self.assertIsNotNone(monitor_process) + + self.monitor_sock = sock.accept()[0] + self.monitor_sock.settimeout(self.DEFAULT_TIMEOUT) + return monitor_process + + def connect_to_debug_monitor(self, launch_args): + monitor_process = self.launch_debug_monitor(launch_args) + self.monitor_server = lldbgdbserverutils.Server(self.monitor_sock, monitor_process) + + def respond(self, packet): + """Subclasses can override this to change how packets are handled.""" + return self.pass_through(packet) + + def pass_through(self, packet): + self.logger.info(f"Sending packet {packet}") + self.monitor_server.send_packet(seven.bitcast_to_bytes(packet)) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.logger.info(f"Received reply {reply}") + return reply diff --git a/lldb/packages/Python/lldbsuite/test/lldbreverse.py b/lldb/packages/Python/lldbsuite/test/lldbreverse.py new file mode 100644 index 00000000000000..0f02fdffbdeada --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/lldbreverse.py @@ -0,0 +1,418 @@ +import os +import os.path +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbgdbproxy import * +import lldbgdbserverutils +import re + + +class ThreadSnapshot: + def __init__(self, thread_id, registers): + self.thread_id = thread_id + self.registers 
= registers + + +class MemoryBlockSnapshot: + def __init__(self, address, data): + self.address = address + self.data = data + + +class StateSnapshot: + def __init__(self, thread_snapshots, memory): + self.thread_snapshots = thread_snapshots + self.memory = memory + self.thread_id = None + + +class RegisterInfo: + def __init__(self, lldb_index, bitsize, little_endian): + self.lldb_index = lldb_index + self.bitsize = bitsize + self.little_endian = little_endian + + +BELOW_STACK_POINTER = 16384 +ABOVE_STACK_POINTER = 4096 + +BLOCK_SIZE = 1024 + +SOFTWARE_BREAKPOINTS = 0 +HARDWARE_BREAKPOINTS = 1 +WRITE_WATCHPOINTS = 2 + + +class ReverseTestBase(GDBProxyTestBase): + """ + Base class for tests that need reverse execution. + + This class uses a gdbserver proxy to add very limited reverse- + execution capability to lldb-server/debugserver for testing + purposes only. + + To use this class, run the inferior forward until some stopping point. + Then call `start_recording()` and execute forward again until reaching + a software breakpoint; this class records the state before each execution executes. + At that point, the server will accept "bc" and "bs" packets to step + backwards through the state. + When executing during recording, we only allow single-step and continue without + delivering a signal, and only software breakpoint stops are allowed. + + We assume that while recording is enabled, the only effects of instructions + are on general-purpose registers (read/written by the 'g' and 'G' packets) + and on memory bytes between [SP - BELOW_STACK_POINTER, SP + ABOVE_STACK_POINTER). + """ + + """ + A list of StateSnapshots in time order. + + There is one snapshot per single-stepped instruction, + representing the state before that instruction was + executed. The last snapshot in the list is the + snapshot before the last instruction was executed. + This is an undo log; we snapshot a superset of the state that may have + been changed by the instruction's execution. 
+ """ + snapshots = None + recording_enabled = False + + breakpoints = None + + pid = None + + pc_register_info = None + sp_register_info = None + general_purpose_register_info = None + + def __init__(self, *args, **kwargs): + GDBProxyTestBase.__init__(self, *args, **kwargs) + self.breakpoints = [set(), set(), set(), set(), set()] + + def respond(self, packet): + if not packet: + raise ValueError("Invalid empty packet") + if packet == self.server.PACKET_INTERRUPT: + # Don't send a response. We'll just run to completion. + return [] + if self.is_command(packet, "qSupported", ":"): + reply = self.pass_through(packet) + return reply + ";ReverseStep+;ReverseContinue+" + if self.is_command(packet, "vCont", ";"): + if self.recording_enabled: + return self.continue_with_recording(packet) + snapshots = [] + if packet[0] == "c" or packet[0] == "s" or packet[0] == "C" or packet[0] == "S": + raise ValueError("LLDB should not be sending old-style continuation packets") + if packet == "bc": + return self.reverse_continue() + if packet == "bs": + return self.reverse_step() + if packet == 'jThreadsInfo': + # Suppress this because it contains thread stop reasons which we might + # need to modify, and we don't want to have to implement that. + return "" + if packet[0] == "z" or packet[0] == "Z": + reply = self.pass_through(packet) + if reply == "OK": + self.update_breakpoints(packet) + return reply + return GDBProxyTestBase.respond(self, packet) + + def start_recording(self): + self.recording_enabled = True + self.snapshots = [] + + def stop_recording(self): + """ + Don't record when executing foward. + + Reverse execution is still supported until the next forward continue. 
+ """ + self.recording_enabled = False + + def is_command(self, packet, cmd, follow_token): + return packet == cmd or packet[0:len(cmd) + 1] == cmd + follow_token + + def update_breakpoints(self, packet): + m = re.match("([zZ])([01234]),([0-9a-f]+),([0-9a-f]+)", packet) + if m is None: + raise ValueError("Invalid breakpoint packet: " + packet) + t = int(m.group(2)) + addr = int(m.group(3), 16) + kind = int(m.group(4), 16) + if m.group(1) == 'Z': + self.breakpoints[t].add((addr, kind)) + else: + self.breakpoints[t].discard((addr, kind)) + + def breakpoint_triggered_at(self, pc): + if any(addr == pc for addr, kind in self.breakpoints[SOFTWARE_BREAKPOINTS]): + return True + if any(addr == pc for addr, kind in self.breakpoints[HARDWARE_BREAKPOINTS]): + return True + return False + + def watchpoint_triggered(self, new_value_block, current_contents): + """Returns the address or None.""" + for watch_addr, kind in breakpoints[WRITE_WATCHPOINTS]: + for offset in range(0, kind): + addr = watch_addr + offset + if (addr >= new_value_block.address and + addr < new_value_block.address + len(new_value_block.data)): + index = addr - new_value_block.address + if new_value_block.data[index*2:(index + 1)*2] != current_contents[index*2:(index + 1)*2]: + return watch_addr + return None + + def continue_with_recording(self, packet): + self.logger.debug("Continue with recording enabled") + + step_packet = "vCont;s" + if packet == "vCont": + requested_step = False + else: + m = re.match("vCont;(c|s)(.*)", packet) + if m is None: + raise ValueError("Unsupported vCont packet: " + packet) + requested_step = m.group(1) == 's' + step_packet += m.group(2) + + while True: + snapshot = self.capture_snapshot() + reply = self.pass_through(step_packet) + (stop_signal, stop_pairs) = self.parse_stop(reply) + if stop_signal != 5: + raise ValueError("Unexpected stop signal: " + reply) + is_swbreak = False + thread_id = None + for key, value in stop_pairs.items(): + if key == "thread": + thread_id = 
self.parse_thread_id(value) + continue + if re.match('[0-9a-f]+', key): + continue + if key == "swbreak" or (key == "reason" and value == "breakpoint"): + is_swbreak = True + continue + if key in ["name", "threads", "thread-pcs", "reason"]: + continue + raise ValueError(f"Unknown stop key '{key}' in {reply}") + if is_swbreak: + self.logger.debug("Recording stopped") + return reply + if thread_id is None: + return ValueError("Expected thread ID: " + reply) + snapshot.thread_id = thread_id + self.snapshots.append(snapshot) + if requested_step: + self.logger.debug("Recording stopped for step") + return reply + + def parse_stop(self, reply): + result = {} + if not reply: + raise ValueError("Invalid empty packet") + if reply[0] == "T" and len(reply) >= 3: + result = {k:v for k, v in self.parse_pairs(reply[3:])} + return (int(reply[1:3], 16), result) + raise "Unsupported stop reply: " + reply + + def parse_pairs(self, text): + for pair in text.split(";"): + if not pair: + continue + m = re.match("([^:]+):(.*)", pair) + if m is None: + raise ValueError("Invalid pair text: " + text) + yield (m.group(1), m.group(2)) + + def capture_snapshot(self): + """Snapshot all threads and their stack memories.""" + self.ensure_register_info() + current_thread = self.get_current_thread() + thread_snapshots = [] + memory = [] + for thread_id in self.get_thread_list(): + registers = {} + for index in sorted(self.general_purpose_register_info.keys()): + reply = self.pass_through(f"p{index:x};thread:{thread_id:x};") + if reply == "" or reply[0] == 'E': + raise ValueError("Can't read register") + registers[index] = reply + thread_snapshot = ThreadSnapshot(thread_id, registers) + thread_sp = self.get_register(self.sp_register_info, thread_snapshot.registers) + memory += self.read_memory(thread_sp - BELOW_STACK_POINTER, thread_sp + ABOVE_STACK_POINTER) + thread_snapshots.append(thread_snapshot) + self.set_current_thread(current_thread) + return StateSnapshot(thread_snapshots, memory) + + def 
restore_snapshot(self, snapshot): + """ + Restore the snapshot during reverse execution. + + If this triggers a breakpoint or watchpoint, return the stop reply, + otherwise None. + """ + current_thread = self.get_current_thread() + stop_reasons = [] + for thread_snapshot in snapshot.thread_snapshots: + thread_id = thread_snapshot.thread_id + for lldb_index in sorted(thread_snapshot.registers.keys()): + data = thread_snapshot.registers[lldb_index] + reply = self.pass_through(f"P{lldb_index:x}={data};thread:{thread_id:x};") + if reply != "OK": + raise ValueError("Can't restore thread register") + if thread_id == snapshot.thread_id: + new_pc = self.get_register(self.pc_register_info, thread_snapshot.registers) + if self.breakpoint_triggered_at(new_pc): + stop_reasons.append([("reason", "breakpoint")]) + self.set_current_thread(current_thread) + for block in snapshot.memory: + current_memory = self.pass_through(f"m{block.address:x},{(len(block.data)/2):x}") + if not current_memory or current_memory[0] == 'E': + raise ValueError("Can't read back memory") + reply = self.pass_through(f"M{block.address:x},{len(block.data)/2:x}:" + block.data) + if reply != "OK": + raise ValueError("Can't restore memory") + watch_addr = self.watchpoint_triggered(block, current_memory[1:]) + if watch_addr is not None: + stop_reasons.append([("reason", "watchpoint"), ("watch", f"{watch_addr:x}")]) + if stop_reasons: + pairs = ";".join(f"{key}:{value}" for key, value in stop_reasons[0]) + return f"T05thread:{self.pid:x}.{snapshot.thread_id:x};{pairs};" + return None + + def reverse_step(self): + if not self.snapshots: + self.logger.debug("Reverse-step at history boundary") + return self.history_boundary_reply(self.get_current_thread()) + self.logger.debug("Reverse-step started") + snapshot = self.snapshots.pop() + stop_reply = self.restore_snapshot(snapshot) + self.set_current_thread(snapshot.thread_id) + self.logger.debug("Reverse-step stopped") + if stop_reply is None: + return 
self.singlestep_stop_reply(snapshot.thread_id) + return stop_reply + + def reverse_continue(self): + self.logger.debug("Reverse-continue started") + thread_id = None + while self.snapshots: + snapshot = self.snapshots.pop() + stop_reply = self.restore_snapshot(snapshot) + thread_id = snapshot.thread_id + if stop_reply is not None: + self.set_current_thread(thread_id) + self.logger.debug("Reverse-continue stopped") + return stop_reply + if thread_id is None: + thread_id = self.get_current_thread() + else: + self.set_current_thread(snapshot.thread_id) + self.logger.debug("Reverse-continue stopped at history boundary") + return self.history_boundary_reply(thread_id) + + def get_current_thread(self): + reply = self.pass_through("qC") + return self.parse_thread_id(reply[2:]) + + def parse_thread_id(self, thread_id): + m = re.match("(p([0-9a-f]+)[.])?([0-9a-f]+)$", thread_id) + if m is None: + raise ValueError("Invalid thread ID: " + thread_id) + if self.pid is None: + self.pid = int(m.group(2), 16) + return int(m.group(3), 16) + + def history_boundary_reply(self, thread_id): + return f"T00thread:{self.pid:x}.{thread_id:x};replaylog:begin;" + + def singlestep_stop_reply(self, thread_id): + return f"T05thread:{self.pid:x}.{thread_id:x};" + + def set_current_thread(self, thread_id): + """ + Set current thread in inner gdbserver. 
+ """ + if thread_id >= 0: + self.pass_through(f"Hg{self.pid:x}.{thread_id:x}") + self.pass_through(f"Hc{self.pid:x}.{thread_id:x}") + else: + self.pass_through(f"Hc-1.-1") + self.pass_through(f"Hg-1.-1") + + def get_register(self, register_info, registers): + if register_info.bitsize % 8 != 0: + raise ValueError("Register size must be a multiple of 8 bits") + if register_info.lldb_index not in registers: + raise ValueError("Register value not captured") + data = registers[register_info.lldb_index] + num_bytes = register_info.bitsize//8 + bytes = [] + for i in range(0, num_bytes): + bytes.append(int(data[i*2:(i + 1)*2], 16)) + if register_info.little_endian: + bytes.reverse() + result = 0 + for byte in bytes: + result = (result << 8) + byte + return result + + def read_memory(self, start_addr, end_addr): + """ + Read a region of memory from the target. + + Some of the addresses may extend into invalid virtual memory; + skip those areas. + Return a list of blocks containing the valid area(s) in the + requested range. 
+ """ + regions = [] + start_addr = start_addr & (BLOCK_SIZE - 1) + end_addr = (end_addr + BLOCK_SIZE - 1) & (BLOCK_SIZE - 1) + for addr in range(start_addr, end_addr, BLOCK_SIZE): + reply = self.pass_through(f"m{addr:x},{(BLOCK_SIZE - 1):x}") + if reply and reply[0] != 'E': + block = MemoryBlockSnapshot(addr, reply[1:]) + regions.append(block) + return regions + + def ensure_register_info(self): + if self.general_purpose_register_info is not None: + return + reply = self.pass_through("qHostInfo") + little_endian = any(kv == ("endian", "little") for kv in self.parse_pairs(reply)) + self.general_purpose_register_info = {} + lldb_index = 0 + while True: + reply = self.pass_through(f"qRegisterInfo{lldb_index:x}") + if not reply or reply[0] == 'E': + break + info = {k:v for k, v in self.parse_pairs(reply)} + reg_info = RegisterInfo(lldb_index, int(info["bitsize"]), little_endian) + if info["set"] == "General Purpose Registers" and not "container-regs" in info: + self.general_purpose_register_info[lldb_index] = reg_info + if "generic" in info: + if info["generic"] == "pc": + self.pc_register_info = reg_info + elif info["generic"] == "sp": + self.sp_register_info = reg_info + lldb_index += 1 + if self.pc_register_info is None or self.sp_register_info is None: + raise ValueError("Can't find generic pc or sp register") + + def get_thread_list(self): + threads = [] + reply = self.pass_through("qfThreadInfo") + while True: + if not reply: + raise ValueError("Missing reply packet") + if reply[0] == 'm': + for id in reply[1:].split(","): + threads.append(self.parse_thread_id(id)) + elif reply[0] == 'l': + return threads + reply = self.pass_through("qsThreadInfo") diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index 8884ef5933ada8..7cc1ac9749ec93 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -143,6 +143,8 @@ STOPPED_DUE_TO_WATCHPOINT = "Process 
should be stopped due to watchpoint" +STOPPED_DUE_TO_HISTORY_BOUNDARY = "Process should be stopped due to history boundary" + DATA_TYPES_DISPLAYED_CORRECTLY = "Data type(s) displayed correctly" VALID_BREAKPOINT = "Got a valid breakpoint" diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index 9773144723c34c..07780f9f9c8393 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -564,6 +564,10 @@ uint32_t SBProcess::GetAddressByteSize() const { } SBError SBProcess::Continue() { + return Continue(RunDirection::eRunForward); +} + +SBError SBProcess::Continue(RunDirection direction) { LLDB_INSTRUMENT_VA(this); SBError sb_error; @@ -574,9 +578,9 @@ SBError SBProcess::Continue() { process_sp->GetTarget().GetAPIMutex()); if (process_sp->GetTarget().GetDebugger().GetAsyncExecution()) - sb_error.ref() = process_sp->Resume(); + sb_error.ref() = process_sp->Resume(direction); else - sb_error.ref() = process_sp->ResumeSynchronous(nullptr); + sb_error.ref() = process_sp->ResumeSynchronous(nullptr, direction); } else sb_error = Status::FromErrorString("SBProcess is invalid"); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index a99456e06d0329..aca8a039952960 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -172,6 +172,7 @@ size_t SBThread::GetStopReasonDataCount() { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: + case eStopReasonHistoryBoundary: // There is no data for these stop reasons. return 0; @@ -233,6 +234,7 @@ uint64_t SBThread::GetStopReasonDataAtIndex(uint32_t idx) { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: + case eStopReasonHistoryBoundary: // There is no data for these stop reasons. 
return 0; diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 8d3a82ef6c990a..ea60492ac46a10 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2553,7 +2553,8 @@ bool CommandInterpreter::DidProcessStopAbnormally() const { const StopReason reason = stop_info->GetStopReason(); if (reason == eStopReasonException || reason == eStopReasonInstrumentation || - reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt) + reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt || + reason == eStopReasonHistoryBoundary) return true; if (reason == eStopReasonSignal) { diff --git a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp index de047ee214c11e..b0aa664775b463 100644 --- a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp @@ -82,6 +82,9 @@ void LogThreadStopInfo(Log &log, const ThreadStopInfo &stop_info, case eStopReasonProcessorTrace: log.Printf("%s: %s processor trace", __FUNCTION__, header); return; + case eStopReasonHistoryBoundary: + log.Printf("%s: %s history boundary", __FUNCTION__, header); + return; default: log.Printf("%s: %s invalid stop reason %" PRIu32, __FUNCTION__, header, static_cast(stop_info.reason)); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 9b2907c6809965..116c43343c01d1 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -402,9 +402,16 @@ lldb_private::DynamicLoader *ProcessKDP::GetDynamicLoader() { Status ProcessKDP::WillResume() { return Status(); } -Status ProcessKDP::DoResume() { +Status ProcessKDP::DoResume(RunDirection direction) { Status error; Log *log = GetLog(KDPLog::Process); + + if 
(direction == RunDirection::eRunReverse) { + error.SetErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + return error; + } + // Only start the async thread if we try to do any process control if (!m_async_thread.IsJoinable()) StartAsyncThread(); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h index e5ec5914f9600d..1b71d83f70b087 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h @@ -90,7 +90,7 @@ class ProcessKDP : public lldb_private::Process { // Process Control lldb_private::Status WillResume() override; - lldb_private::Status DoResume() override; + lldb_private::Status DoResume(lldb::RunDirection direction) override; lldb_private::Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index 703aa082f0476f..76b7095deaa503 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -204,11 +204,17 @@ ProcessWindows::DoAttachToProcessWithID(lldb::pid_t pid, return error; } -Status ProcessWindows::DoResume() { +Status ProcessWindows::DoResume(RunDirection direction) { Log *log = GetLog(WindowsLog::Process); llvm::sys::ScopedLock lock(m_mutex); Status error; + if (direction == RunDirection::eRunReverse) { + error.SetErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + return error; + } + StateType private_state = GetPrivateState(); if (private_state == eStateStopped || private_state == eStateCrashed) { LLDB_LOG(log, "process {0} is in state {1}. 
Resuming...", diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h index e97cfb790248be..97284b7cd1436e 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h @@ -52,7 +52,7 @@ class ProcessWindows : public Process, public ProcessDebugger { Status DoAttachToProcessWithID( lldb::pid_t pid, const lldb_private::ProcessAttachInfo &attach_info) override; - Status DoResume() override; + Status DoResume(lldb::RunDirection direction) override; Status DoDestroy() override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index e42526c8fd7266..fc792a4409410b 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -199,6 +199,20 @@ uint64_t GDBRemoteCommunicationClient::GetRemoteMaxPacketSize() { return m_max_packet_size; } +bool GDBRemoteCommunicationClient::GetReverseContinueSupported() { + if (m_supports_reverse_continue == eLazyBoolCalculate) { + GetRemoteQSupported(); + } + return m_supports_reverse_continue == eLazyBoolYes; +} + +bool GDBRemoteCommunicationClient::GetReverseStepSupported() { + if (m_supports_reverse_step == eLazyBoolCalculate) { + GetRemoteQSupported(); + } + return m_supports_reverse_step == eLazyBoolYes; +} + bool GDBRemoteCommunicationClient::QueryNoAckModeSupported() { if (m_supports_not_sending_acks == eLazyBoolCalculate) { m_send_acks = true; @@ -295,6 +309,8 @@ void GDBRemoteCommunicationClient::ResetDiscoverableSettings(bool did_exec) { m_supports_qXfer_siginfo_read = eLazyBoolCalculate; m_supports_augmented_libraries_svr4_read = eLazyBoolCalculate; m_uses_native_signals = eLazyBoolCalculate; + m_supports_reverse_continue = 
eLazyBoolCalculate; + m_supports_reverse_step = eLazyBoolCalculate; m_supports_qProcessInfoPID = true; m_supports_qfProcessInfo = true; m_supports_qUserName = true; @@ -348,6 +364,8 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_memory_tagging = eLazyBoolNo; m_supports_qSaveCore = eLazyBoolNo; m_uses_native_signals = eLazyBoolNo; + m_supports_reverse_continue = eLazyBoolNo; + m_supports_reverse_step = eLazyBoolNo; m_max_packet_size = UINT64_MAX; // It's supposed to always be there, but if // not, we assume no limit @@ -401,6 +419,10 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_qSaveCore = eLazyBoolYes; else if (x == "native-signals+") m_uses_native_signals = eLazyBoolYes; + else if (x == "ReverseContinue+") + m_supports_reverse_continue = eLazyBoolYes; + else if (x == "ReverseStep+") + m_supports_reverse_step = eLazyBoolYes; // Look for a list of compressions in the features list e.g. // qXfer:features:read+;PacketSize=20000;qEcho+;SupportedCompressions=zlib- // deflate,lzma diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h index 898d176abc3465..116b47c1edf033 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h @@ -331,6 +331,10 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { bool GetMultiprocessSupported(); + bool GetReverseContinueSupported(); + + bool GetReverseStepSupported(); + LazyBool SupportsAllocDeallocMemory() // const { // Uncomment this to have lldb pretend the debug server doesn't respond to @@ -561,6 +565,8 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { LazyBool m_supports_memory_tagging = eLazyBoolCalculate; LazyBool m_supports_qSaveCore = eLazyBoolCalculate; LazyBool m_uses_native_signals = eLazyBoolCalculate; + LazyBool 
m_supports_reverse_continue = eLazyBoolCalculate; + LazyBool m_supports_reverse_step = eLazyBoolCalculate; bool m_supports_qProcessInfoPID : 1, m_supports_qfProcessInfo : 1, m_supports_qUserName : 1, m_supports_qGroupName : 1, diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 35fa93e53bc66f..4016cde74ebea8 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -716,6 +716,7 @@ static const char *GetStopReasonString(StopReason stop_reason) { return "vforkdone"; case eStopReasonInterrupt: return "async interrupt"; + case eStopReasonHistoryBoundary: case eStopReasonInstrumentation: case eStopReasonInvalid: case eStopReasonPlanComplete: diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3e09c316d74f44..3fc03bd05d5df0 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -169,6 +169,10 @@ class PluginProperties : public Properties { } }; +std::chrono::seconds ResumeTimeout() { + return std::chrono::seconds(5); +} + } // namespace static PluginProperties &GetGlobalPluginProperties() { @@ -1180,10 +1184,11 @@ Status ProcessGDBRemote::WillResume() { return Status(); } -Status ProcessGDBRemote::DoResume() { +Status ProcessGDBRemote::DoResume(RunDirection direction) { Status error; Log *log = GetLog(GDBRLog::Process); - LLDB_LOGF(log, "ProcessGDBRemote::Resume()"); + LLDB_LOGF(log, "ProcessGDBRemote::Resume(%s)", + direction == RunDirection::eRunForward ? 
"" : "reverse"); ListenerSP listener_sp( Listener::MakeListener("gdb-remote.resume-packet-sent")); @@ -1197,12 +1202,21 @@ Status ProcessGDBRemote::DoResume() { StreamString continue_packet; bool continue_packet_error = false; - if (m_gdb_comm.HasAnyVContSupport()) { + // Number of threads continuing with "c", i.e. continuing without a signal to deliver. + const size_t num_continue_c_tids = m_continue_c_tids.size(); + // Number of threads continuing with "C", i.e. continuing with a signal to deliver. + const size_t num_continue_C_tids = m_continue_C_tids.size(); + // Number of threads continuing with "s", i.e. single-stepping. + const size_t num_continue_s_tids = m_continue_s_tids.size(); + // Number of threads continuing with "S", i.e. single-stepping with a signal to deliver. + const size_t num_continue_S_tids = m_continue_S_tids.size(); + if (direction == RunDirection::eRunForward && + m_gdb_comm.HasAnyVContSupport()) { std::string pid_prefix; if (m_gdb_comm.GetMultiprocessSupported()) pid_prefix = llvm::formatv("p{0:x-}.", GetID()); - if (m_continue_c_tids.size() == num_threads || + if (num_continue_c_tids == num_threads || (m_continue_c_tids.empty() && m_continue_C_tids.empty() && m_continue_s_tids.empty() && m_continue_S_tids.empty())) { // All threads are continuing @@ -1265,14 +1279,11 @@ Status ProcessGDBRemote::DoResume() { } else continue_packet_error = true; - if (continue_packet_error) { + if (direction == RunDirection::eRunForward && continue_packet_error) { // Either no vCont support, or we tried to use part of the vCont packet - // that wasn't supported by the remote GDB server. 
We need to try and - // make a simple packet that can do our continue - const size_t num_continue_c_tids = m_continue_c_tids.size(); - const size_t num_continue_C_tids = m_continue_C_tids.size(); - const size_t num_continue_s_tids = m_continue_s_tids.size(); - const size_t num_continue_S_tids = m_continue_S_tids.size(); + // that wasn't supported by the remote GDB server, or it's the reverse + // direction. We need to try and make a simple packet that can do our + // continue. if (num_continue_c_tids > 0) { if (num_continue_c_tids == num_threads) { // All threads are resuming... @@ -1363,9 +1374,41 @@ Status ProcessGDBRemote::DoResume() { } } + if (direction == RunDirection::eRunReverse && continue_packet_error) { + if (num_continue_C_tids > 0 || num_continue_S_tids > 0) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: Signals not supported"); + return Status::FromErrorString("can't deliver signals while running in reverse"); + } + + if (num_continue_s_tids > 0) { + if (num_continue_s_tids > 1) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: can't step multiple threads"); + return Status::FromErrorString("can't step multiple threads while reverse-stepping"); + } + + if (!m_gdb_comm.GetReverseStepSupported()) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-stepping"); + return Status::FromErrorString("target does not support reverse-stepping"); + } + + m_gdb_comm.SetCurrentThreadForRun(m_continue_s_tids.front()); + continue_packet.PutCString("bs"); + } else { + if (!m_gdb_comm.GetReverseContinueSupported()) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-continue"); + return Status::FromErrorString("target does not support reverse-continue"); + } + + // All threads continue whether requested or not --- + // we can't change how threads ran in the past. 
+ continue_packet.PutCString("bc"); + } + + continue_packet_error = false; + } + if (continue_packet_error) { - error = - Status::FromErrorString("can't make continue packet for this resume"); + return Status::FromErrorString("can't make continue packet for this resume"); } else { EventSP event_sp; if (!m_async_thread.IsJoinable()) { @@ -1380,7 +1423,7 @@ Status ProcessGDBRemote::DoResume() { std::make_shared(continue_packet.GetString()); m_async_broadcaster.BroadcastEvent(eBroadcastBitAsyncContinue, data_sp); - if (!listener_sp->GetEvent(event_sp, std::chrono::seconds(5))) { + if (!listener_sp->GetEvent(event_sp, ResumeTimeout())) { error = Status::FromErrorString("Resume timed out."); LLDB_LOGF(log, "ProcessGDBRemote::DoResume: Resume timed out."); } else if (event_sp->BroadcasterIs(&m_async_broadcaster)) { @@ -1863,6 +1906,10 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo( thread_sp->SetStopInfo(StopInfo::CreateStopReasonWithException( *thread_sp, description.c_str())); handled = true; + } else if (reason == "replaylog") { + thread_sp->SetStopInfo(StopInfo::CreateStopReasonHistoryBoundary( + *thread_sp, description.c_str())); + handled = true; } else if (reason == "exec") { did_exec = true; thread_sp->SetStopInfo( @@ -2318,6 +2365,8 @@ StateType ProcessGDBRemote::SetThreadStopInfo(StringExtractor &stop_packet) { description = std::string(ostr.GetString()); } else if (key.compare("swbreak") == 0 || key.compare("hwbreak") == 0) { reason = "breakpoint"; + } else if (key.compare("replaylog") == 0) { + reason = "replaylog"; } else if (key.compare("library") == 0) { auto error = LoadModules(); if (error) { diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h index 2492795851388a..fa3e1cec76e2b3 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h @@ -111,7 +111,7 @@ class ProcessGDBRemote : public Process, // 
Process Control Status WillResume() override; - Status DoResume() override; + Status DoResume(lldb::RunDirection direction) override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp index d2111ce877ce55..304c12173dd35d 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp @@ -182,10 +182,15 @@ void ScriptedProcess::DidResume() { m_pid = GetInterface().GetProcessID(); } -Status ScriptedProcess::DoResume() { +Status ScriptedProcess::DoResume(RunDirection direction) { LLDB_LOGF(GetLog(LLDBLog::Process), "ScriptedProcess::%s resuming process", __FUNCTION__); - return GetInterface().Resume(); + if (direction == RunDirection::eRunForward) { + return GetInterface().Resume(); + } else { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + } } Status ScriptedProcess::DoAttach(const ProcessAttachInfo &attach_info) { diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h index 0335364b4010b2..8ebe4ca5f3d449 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h @@ -52,7 +52,7 @@ class ScriptedProcess : public Process { void DidResume() override; - Status DoResume() override; + Status DoResume(lldb::RunDirection direction) override; Status DoAttachToProcessWithID(lldb::pid_t pid, const ProcessAttachInfo &attach_info) override; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index aca08972811470..ff6a2f59eba35f 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -446,7 +446,8 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, m_memory_cache(*this), m_allocated_memory_cache(*this), 
m_should_detach(false), m_next_event_action_up(), m_public_run_lock(), m_private_run_lock(), m_currently_handling_do_on_removals(false), - m_resume_requested(false), m_interrupt_tid(LLDB_INVALID_THREAD_ID), + m_resume_requested(false), m_last_run_direction(eRunForward), + m_interrupt_tid(LLDB_INVALID_THREAD_ID), m_finalizing(false), m_destructing(false), m_clear_thread_plans_on_stop(false), m_force_next_event_delivery(false), m_last_broadcast_state(eStateInvalid), m_destroy_in_process(false), @@ -845,6 +846,7 @@ bool Process::HandleProcessStateChangedEvent( switch (thread_stop_reason) { case eStopReasonInvalid: case eStopReasonNone: + case eStopReasonHistoryBoundary: break; case eStopReasonSignal: { @@ -1352,7 +1354,7 @@ void Process::SetPublicState(StateType new_state, bool restarted) { } } -Status Process::Resume() { +Status Process::Resume(RunDirection direction) { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "(plugin = %s) -- locking run lock", GetPluginName().data()); if (!m_public_run_lock.TrySetRunning()) { @@ -1361,7 +1363,7 @@ Status Process::Resume() { return Status::FromErrorString( "Resume request failed - process still running."); } - Status error = PrivateResume(); + Status error = PrivateResume(direction); if (!error.Success()) { // Undo running state change m_public_run_lock.SetStopped(); @@ -1369,7 +1371,7 @@ Status Process::Resume() { return error; } -Status Process::ResumeSynchronous(Stream *stream) { +Status Process::ResumeSynchronous(Stream *stream, RunDirection direction) { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "Process::ResumeSynchronous -- locking run lock"); if (!m_public_run_lock.TrySetRunning()) { @@ -1382,7 +1384,7 @@ Status Process::ResumeSynchronous(Stream *stream) { Listener::MakeListener(ResumeSynchronousHijackListenerName.data())); HijackProcessEvents(listener_sp); - Status error = PrivateResume(); + Status error = PrivateResume(direction); if (error.Success()) { StateType state = 
WaitForProcessToStop(std::nullopt, nullptr, true, listener_sp, stream, @@ -3239,7 +3241,7 @@ Status Process::ConnectRemote(llvm::StringRef remote_url) { return error; } -Status Process::PrivateResume() { +Status Process::PrivateResume(RunDirection direction) { Log *log(GetLog(LLDBLog::Process | LLDBLog::Step)); LLDB_LOGF(log, "Process::PrivateResume() m_stop_id = %u, public state: %s " @@ -3255,6 +3257,15 @@ Status Process::PrivateResume() { if (!GetModID().IsLastResumeForUserExpression()) ResetExtendedCrashInfoDict(); + if (m_last_run_direction != direction) { + // In the future we might want to support mixed-direction plans, + // e.g. a forward step-over stops at a breakpoint, the user does + // a reverse-step, then disables the breakpoint and continues forward. + // This code will need to be changed to support that. + m_thread_list.DiscardThreadPlans(); + m_last_run_direction = direction; + } + Status error(WillResume()); // Tell the process it is about to resume before the thread list if (error.Success()) { @@ -3272,7 +3283,7 @@ Status Process::PrivateResume() { "Process::PrivateResume PreResumeActions failed, not resuming."); } else { m_mod_id.BumpResumeID(); - error = DoResume(); + error = DoResume(direction); if (error.Success()) { DidResume(); m_thread_list.DidResume(); @@ -3735,7 +3746,7 @@ bool Process::ShouldBroadcastEvent(Event *event_ptr) { "from state: %s", static_cast(event_ptr), StateAsCString(state)); ProcessEventData::SetRestartedInEvent(event_ptr, true); - PrivateResume(); + PrivateResume(m_last_run_direction); } } else { return_value = true; @@ -4346,7 +4357,7 @@ void Process::ProcessEventData::DoOnRemoval(Event *event_ptr) { SetRestarted(true); // Use the private resume method here, since we aren't changing the run // lock state. 
- process_sp->PrivateResume(); + process_sp->PrivateResume(process_sp->m_last_run_direction); } else { bool hijacked = process_sp->IsHijackedForEvent(eBroadcastBitStateChanged) && !process_sp->StateChangedIsHijackedForSynchronousResume(); diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index bd7032b803df90..08e9a7c099bad2 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -1212,6 +1212,30 @@ class StopInfoProcessorTrace : public StopInfo { } }; +// StopInfoHistoryBoundary + +class StopInfoHistoryBoundary : public StopInfo { +public: + StopInfoHistoryBoundary(Thread &thread, const char *description) + : StopInfo(thread, LLDB_INVALID_UID) { + if (description) + SetDescription(description); + } + + ~StopInfoHistoryBoundary() override = default; + + StopReason GetStopReason() const override { + return eStopReasonHistoryBoundary; + } + + const char *GetDescription() override { + if (m_description.empty()) + return "history boundary"; + else + return m_description.c_str(); + } +}; + // StopInfoThreadPlan class StopInfoThreadPlan : public StopInfo { @@ -1439,6 +1463,11 @@ StopInfoSP StopInfo::CreateStopReasonProcessorTrace(Thread &thread, return StopInfoSP(new StopInfoProcessorTrace(thread, description)); } +StopInfoSP StopInfo::CreateStopReasonHistoryBoundary(Thread &thread, + const char *description) { + return StopInfoSP(new StopInfoHistoryBoundary(thread, description)); +} + StopInfoSP StopInfo::CreateStopReasonWithExec(Thread &thread) { return StopInfoSP(new StopInfoExec(thread)); } diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index 902fbb2b519ef7..bbb586f033b746 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -624,10 +624,12 @@ void Thread::SetupForResume() { // what the current plan is. 
lldb::RegisterContextSP reg_ctx_sp(GetRegisterContext()); - if (reg_ctx_sp) { + ProcessSP process_sp(GetProcess()); + if (reg_ctx_sp && process_sp && + process_sp->GetLastRunDirection() == eRunForward) { const addr_t thread_pc = reg_ctx_sp->GetPC(); BreakpointSiteSP bp_site_sp = - GetProcess()->GetBreakpointSiteList().FindByAddress(thread_pc); + process_sp->GetBreakpointSiteList().FindByAddress(thread_pc); if (bp_site_sp) { // Note, don't assume there's a ThreadPlanStepOverBreakpoint, the // target may not require anything special to step over a breakpoint. @@ -1732,6 +1734,8 @@ std::string Thread::StopReasonAsString(lldb::StopReason reason) { return "processor trace"; case eStopReasonInterrupt: return "async interrupt"; + case eStopReasonHistoryBoundary: + return "history boundary"; } return "StopReason = " + std::to_string(reason); diff --git a/lldb/test/API/functionalities/reverse-execution/Makefile b/lldb/test/API/functionalities/reverse-execution/Makefile new file mode 100644 index 00000000000000..10495940055b63 --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py new file mode 100644 index 00000000000000..b37578fbd82468 --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py @@ -0,0 +1,115 @@ +import lldb +import time +import unittest +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbreverse import ReverseTestBase +from lldbsuite.test import lldbutil + + +class TestReverseContinueBreakpoints(ReverseTestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_reverse_continue(self): + self.reverse_continue_internal(async_mode=False) + + def 
test_reverse_continue_async(self): + self.reverse_continue_internal(async_mode=True) + + def reverse_continue_internal(self, async_mode): + target, process, initial_threads = self.setup_recording(async_mode) + + # Reverse-continue. We'll stop at the point where we started recording. + status = process.Continue(lldb.eRunReverse) + self.assertSuccess(status) + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) + self.expect( + "thread list", + STOPPED_DUE_TO_HISTORY_BOUNDARY, + substrs=["stopped", "stop reason = history boundary"], + ) + + # Continue forward normally until the target exits. + status = process.Continue() + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateExited]) + self.assertSuccess(status) + self.assertState(process.GetState(), lldb.eStateExited) + self.assertEqual(process.GetExitStatus(), 0) + + def test_reverse_continue_breakpoint(self): + self.reverse_continue_breakpoint_internal(async_mode=False) + + def test_reverse_continue_breakpoint_async(self): + self.reverse_continue_breakpoint_internal(async_mode=True) + + def reverse_continue_breakpoint_internal(self, async_mode): + target, process, initial_threads = self.setup_recording(async_mode) + + # Reverse-continue to the function "trigger_breakpoint". 
+ trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) + status = process.Continue(lldb.eRunReverse) + self.assertSuccess(status) + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) + threads_now = lldbutil.get_threads_stopped_at_breakpoint(process, trigger_bkpt) + self.assertEqual(threads_now, initial_threads) + + def test_reverse_continue_skip_breakpoint(self): + self.reverse_continue_skip_breakpoint_internal(async_mode=False) + + def test_reverse_continue_skip_breakpoint_async(self): + self.reverse_continue_skip_breakpoint_internal(async_mode=True) + + def reverse_continue_skip_breakpoint_internal(self, async_mode): + target, process, initial_threads = self.setup_recording(async_mode) + + # Reverse-continue over a breakpoint at "trigger_breakpoint" whose + # condition is false. + # This tests that we continue in the correct direction after hitting + # the breakpoint. + trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) + trigger_bkpt.SetCondition("false_condition") + status = process.Continue(lldb.eRunReverse) + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) + self.assertSuccess(status) + self.expect( + "thread list", + STOPPED_DUE_TO_HISTORY_BOUNDARY, + substrs=["stopped", "stop reason = history boundary"], + ) + + def setup_recording(self, async_mode): + """ + Record execution of code between "start_recording" and "stop_recording" breakpoints. + + Returns with the target stopped at "stop_recording", with recording disabled, + ready to reverse-execute. + """ + self.build() + target = self.dbg.CreateTarget("") + process = self.connect(target) + + # Record execution from the start of the function "start_recording" + # to the start of the function "stop_recording". We want to keep the + # interval that we record as small as possible to minimize the run-time + # of our single-stepping recorder. 
+ start_recording_bkpt = target.BreakpointCreateByName("start_recording", None) + initial_threads = lldbutil.continue_to_breakpoint(process, start_recording_bkpt) + self.assertEqual(len(initial_threads), 1) + target.BreakpointDelete(start_recording_bkpt.GetID()) + self.start_recording() + stop_recording_bkpt = target.BreakpointCreateByName("stop_recording", None) + lldbutil.continue_to_breakpoint(process, stop_recording_bkpt) + target.BreakpointDelete(stop_recording_bkpt.GetID()) + self.stop_recording() + + self.dbg.SetAsync(async_mode) + self.expect_async_state_changes(async_mode, process, [lldb.eStateStopped]) + + return target, process, initial_threads + + def expect_async_state_changes(self, async_mode, process, states): + if not async_mode: + return + listener = self.dbg.GetListener() + lldbutil.expect_state_changes(self, listener, process, states) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py new file mode 100644 index 00000000000000..d610761b8cb0bc --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py @@ -0,0 +1,30 @@ +import lldb +import unittest +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test import lldbutil + + +class TestReverseContinueNotSupported(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_reverse_continue_not_supported(self): + self.build() + exe = self.getBuildArtifact("a.out") + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + + main_bkpt = target.BreakpointCreateByName("main", None) + self.assertTrue(main_bkpt, VALID_BREAKPOINT) + + process = target.LaunchSimple(None, None, self.get_process_working_directory()) + self.assertTrue(process, PROCESS_IS_VALID) + + # This will fail gracefully. 
+ status = process.Continue(lldb.eRunReverse) + self.assertFailure(status, "target does not support reverse-continue") + + status = process.Continue() + self.assertSuccess(status) + self.assertState(process.GetState(), lldb.eStateExited) + self.assertEqual(process.GetExitStatus(), 0) diff --git a/lldb/test/API/functionalities/reverse-execution/main.c b/lldb/test/API/functionalities/reverse-execution/main.c new file mode 100644 index 00000000000000..40e45dc9f5c317 --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/main.c @@ -0,0 +1,14 @@ +volatile int false_condition = 0; + +static void start_recording() {} + +static void trigger_breakpoint() {} + +static void stop_recording() {} + +int main() { + start_recording(); + trigger_breakpoint(); + stop_recording(); + return 0; +} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 558f889c4b7f23..211fd34957f496 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -1045,6 +1045,9 @@ llvm::json::Value CreateThreadStopped(lldb::SBThread &thread, case lldb::eStopReasonProcessorTrace: body.try_emplace("reason", "processor trace"); break; + case lldb::eStopReasonHistoryBoundary: + body.try_emplace("reason", "history boundary"); + break; case lldb::eStopReasonSignal: case lldb::eStopReasonException: body.try_emplace("reason", "exception"); diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp index b38833c0fdb6b6..1c5e3ac7008727 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.cpp +++ b/lldb/tools/lldb-dap/LLDBUtils.cpp @@ -111,6 +111,7 @@ bool ThreadHasStopReason(lldb::SBThread &thread) { case lldb::eStopReasonVFork: case lldb::eStopReasonVForkDone: case lldb::eStopReasonInterrupt: + case lldb::eStopReasonHistoryBoundary: return true; case lldb::eStopReasonThreadExiting: case lldb::eStopReasonInvalid: From f0ed31ce4b63a5530fd1de875c0d1467d4d2c6ea Mon Sep 17 00:00:00 2001 From: Youngsuk Kim Date: Thu, 10 Oct 2024 
16:02:13 -0400 Subject: [PATCH 073/177] [llvm][PGOCtxProfLowering] Avoid Type::getPointerTo() (NFC) (#111857) `Type::getPointerTo()` is to be deprecated & removed soon. --- .../Instrumentation/PGOCtxProfLowering.cpp | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp index b620306628729b..e7b7c26c493e50 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp @@ -154,15 +154,15 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M, StartCtx = cast( M.getOrInsertFunction( CompilerRtAPINames::StartCtx, - FunctionType::get(ContextNodeTy->getPointerTo(), - {ContextRootTy->getPointerTo(), /*ContextRoot*/ + FunctionType::get(PointerTy, + {PointerTy, /*ContextRoot*/ I64Ty, /*Guid*/ I32Ty, /*NumCounters*/ I32Ty /*NumCallsites*/}, false)) .getCallee()); GetCtx = cast( M.getOrInsertFunction(CompilerRtAPINames::GetCtx, - FunctionType::get(ContextNodeTy->getPointerTo(), + FunctionType::get(PointerTy, {PointerTy, /*Callee*/ I64Ty, /*Guid*/ I32Ty, /*NumCounters*/ @@ -170,13 +170,12 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M, false)) .getCallee()); ReleaseCtx = cast( - M.getOrInsertFunction( - CompilerRtAPINames::ReleaseCtx, - FunctionType::get(Type::getVoidTy(M.getContext()), - { - ContextRootTy->getPointerTo(), /*ContextRoot*/ - }, - false)) + M.getOrInsertFunction(CompilerRtAPINames::ReleaseCtx, + FunctionType::get(Type::getVoidTy(M.getContext()), + { + PointerTy, /*ContextRoot*/ + }, + false)) .getCallee()); // Declare the TLSes we will need to use. @@ -264,7 +263,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { auto *Index = Builder.CreateAnd(CtxAsInt, Builder.getInt64(1)); // The GEPs corresponding to that index, in the respective TLS. 
ExpectedCalleeTLSAddr = Builder.CreateGEP( - Builder.getInt8Ty()->getPointerTo(), + PointerType::getUnqual(F.getContext()), Builder.CreateThreadLocalAddress(ExpectedCalleeTLS), {Index}); CallsiteInfoTLSAddr = Builder.CreateGEP( Builder.getInt32Ty(), @@ -277,7 +276,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) { // with counters) stays the same. RealContext = Builder.CreateIntToPtr( Builder.CreateAnd(CtxAsInt, Builder.getInt64(-2)), - ThisContextType->getPointerTo()); + PointerType::getUnqual(F.getContext())); I.eraseFromParent(); break; } From 942fefe74112acb68fa43dde44abe3ae125457e1 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 11:23:11 -0700 Subject: [PATCH 074/177] [NFC][sanitizer] Reopen '/proc/%d/task' instead of seek NFC because I am not aware of any particular issue from seek, but reopen looks less error prone. Pull Request: https://github.com/llvm/llvm-project/pull/111899 --- .../lib/sanitizer_common/sanitizer_linux.cpp | 30 +++++++------------ .../lib/sanitizer_common/sanitizer_linux.h | 5 ++-- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index d421d117e67274..70fd9405e5454f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1025,21 +1025,19 @@ bool internal_sigismember(__sanitizer_sigset_t *set, int signum) { # if !SANITIZER_NETBSD // ThreadLister implementation. 
-ThreadLister::ThreadLister(pid_t pid) : pid_(pid), buffer_(4096) { - char task_directory_path[80]; - internal_snprintf(task_directory_path, sizeof(task_directory_path), - "/proc/%d/task/", pid); - descriptor_ = internal_open(task_directory_path, O_RDONLY | O_DIRECTORY); - if (internal_iserror(descriptor_)) { - Report("Can't open /proc/%d/task for reading.\n", pid); - } +ThreadLister::ThreadLister(pid_t pid) : buffer_(4096) { + task_path_.AppendF("/proc/%d/task", pid); + status_path_.AppendF("%s/status", task_path_.data()); } ThreadLister::Result ThreadLister::ListThreads( InternalMmapVector *threads) { - if (internal_iserror(descriptor_)) + int descriptor = internal_open(task_path_.data(), O_RDONLY | O_DIRECTORY); + if (internal_iserror(descriptor)) { + Report("Can't open %s for reading.\n", task_path_.data()); return Error; - internal_lseek(descriptor_, 0, SEEK_SET); + } + auto acts_cleanup = at_scope_exit([&] { internal_close(descriptor); }); threads->clear(); Result result = Ok; @@ -1048,11 +1046,11 @@ ThreadLister::Result ThreadLister::ListThreads( buffer_.resize(buffer_.capacity()); CHECK_GE(buffer_.size(), 4096); uptr read = internal_getdents( - descriptor_, (struct linux_dirent *)buffer_.data(), buffer_.size()); + descriptor, (struct linux_dirent *)buffer_.data(), buffer_.size()); if (!read) return result; if (internal_iserror(read)) { - Report("Can't read directory entries from /proc/%d/task.\n", pid_); + Report("Can't read directory entries from %s.\n", task_path_.data()); return Error; } @@ -1093,9 +1091,7 @@ ThreadLister::Result ThreadLister::ListThreads( bool ThreadLister::IsAlive(int tid) { // /proc/%d/task/%d/status uses same call to detect alive threads as // proc_task_readdir. See task_state implementation in Linux. 
- char path[80]; - internal_snprintf(path, sizeof(path), "/proc/%d/task/%d/status", pid_, tid); - if (!ReadFileToVector(path, &buffer_) || buffer_.empty()) + if (!ReadFileToVector(status_path_.data(), &buffer_) || buffer_.empty()) return false; buffer_.push_back(0); static const char kPrefix[] = "\nPPid:"; @@ -1106,10 +1102,6 @@ bool ThreadLister::IsAlive(int tid) { return (int)internal_atoll(field) != 0; } -ThreadLister::~ThreadLister() { - if (!internal_iserror(descriptor_)) - internal_close(descriptor_); -} # endif # if SANITIZER_WORDSIZE == 32 diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h index c30f0326793d5a..96c617822b5b27 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h @@ -97,7 +97,6 @@ uptr internal_clone(int (*fn)(void *), void *child_stack, int flags, void *arg); class ThreadLister { public: explicit ThreadLister(pid_t pid); - ~ThreadLister(); enum Result { Error, Incomplete, @@ -108,8 +107,8 @@ class ThreadLister { private: bool IsAlive(int tid); - pid_t pid_; - int descriptor_ = -1; + InternalScopedString task_path_; + InternalScopedString status_path_; InternalMmapVector buffer_; }; From 69b0b7e7ac3adc42df517c25ed7017b5af9be9f1 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 10 Oct 2024 13:11:08 -0700 Subject: [PATCH 075/177] [lldb] Return an llvm::Error from GetFrameBaseValue (#111882) This fixes the following assertion: "Cannot create Expected from Error success value." The problem was that GetFrameBaseValue return false without updating the Status argument. This patch eliminates the opportunity for mistakes by returning an llvm:Error. 
--- lldb/include/lldb/Target/StackFrame.h | 10 +++------- lldb/source/Expression/DWARFExpression.cpp | 14 ++++++-------- lldb/source/Target/StackFrame.cpp | 13 ++++++------- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/lldb/include/lldb/Target/StackFrame.h b/lldb/include/lldb/Target/StackFrame.h index 5cc0fccee03b8f..fdbe1f567eabfa 100644 --- a/lldb/include/lldb/Target/StackFrame.h +++ b/lldb/include/lldb/Target/StackFrame.h @@ -195,14 +195,10 @@ class StackFrame : public ExecutionContextScope, /// \param [out] value /// The address of the CFA for this frame, if available. /// - /// \param [out] error_ptr - /// If there is an error determining the CFA address, this may contain a - /// string explaining the failure. - /// /// \return - /// Returns true if the CFA value was successfully set in value. Some - /// frames may be unable to provide this value; they will return false. - bool GetFrameBaseValue(Scalar &value, Status *error_ptr); + /// If there is an error determining the CFA address, return an error + /// explaining the failure. Success otherwise. + llvm::Error GetFrameBaseValue(Scalar &value); /// Get the DWARFExpressionList corresponding to the Canonical Frame Address. 
/// diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp index 22d899f799d0fd..97bcd4f7eec26f 100644 --- a/lldb/source/Expression/DWARFExpression.cpp +++ b/lldb/source/Expression/DWARFExpression.cpp @@ -1780,14 +1780,12 @@ llvm::Expected DWARFExpression::Evaluate( if (exe_ctx) { if (frame) { Scalar value; - Status fb_err; - if (frame->GetFrameBaseValue(value, &fb_err)) { - int64_t fbreg_offset = opcodes.GetSLEB128(&offset); - value += fbreg_offset; - stack.push_back(value); - stack.back().SetValueType(Value::ValueType::LoadAddress); - } else - return fb_err.ToError(); + if (llvm::Error err = frame->GetFrameBaseValue(value)) + return err; + int64_t fbreg_offset = opcodes.GetSLEB128(&offset); + value += fbreg_offset; + stack.push_back(value); + stack.back().SetValueType(Value::ValueType::LoadAddress); } else { return llvm::createStringError( "invalid stack frame in context for DW_OP_fbreg opcode"); diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index fe0d4c93c50627..ed493e35316137 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1079,12 +1079,12 @@ ValueObjectSP StackFrame::GetValueForVariableExpressionPath( return valobj_sp; } -bool StackFrame::GetFrameBaseValue(Scalar &frame_base, Status *error_ptr) { +llvm::Error StackFrame::GetFrameBaseValue(Scalar &frame_base) { std::lock_guard guard(m_mutex); if (!m_cfa_is_valid) { m_frame_base_error = Status::FromErrorString( "No frame base available for this historical stack frame."); - return false; + return m_frame_base_error.ToError(); } if (m_flags.IsClear(GOT_FRAME_BASE)) { @@ -1113,12 +1113,11 @@ bool StackFrame::GetFrameBaseValue(Scalar &frame_base, Status *error_ptr) { } } - if (m_frame_base_error.Success()) - frame_base = m_frame_base; + if (m_frame_base_error.Fail()) + return m_frame_base_error.ToError(); - if (error_ptr) - *error_ptr = m_frame_base_error.Clone(); - return 
m_frame_base_error.Success(); + frame_base = m_frame_base; + return llvm::Error::success(); } DWARFExpressionList *StackFrame::GetFrameBaseExpression(Status *error_ptr) { From b3554265f24aa570bbc8693af8420a306c459f94 Mon Sep 17 00:00:00 2001 From: Chelsea Cassanova Date: Thu, 10 Oct 2024 13:11:46 -0700 Subject: [PATCH 076/177] [lldb] Add include for SBLanguages in lldb-enumerations (#111907) This adds an include for SBLanguages.h in lldb-enumerations.h so that files that need this enum do not have to explicitly include SBLanguages. --- lldb/include/lldb/lldb-enumerations.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 232d1dfdb5c9d0..217cd7f65cc1c4 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -12,6 +12,8 @@ #include #include +#include + #ifndef SWIG // Macro to enable bitmask operations on an enum. Without this, Enum | Enum // gets promoted to an int, so you have to say Enum a = Enum(eFoo | eBar). If From 36bd9aebc428413a94f77e8daa679d1937dc2b63 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 13:12:36 -0700 Subject: [PATCH 077/177] [sanitizer] VReport BeforeFork/AfterFork (#111900) Forks are common suspects for unusual sanitizer behavior. It can be handy to see them without rebuild. 
--- compiler-rt/lib/asan/asan_posix.cpp | 2 ++ compiler-rt/lib/dfsan/dfsan_custom.cpp | 2 ++ compiler-rt/lib/hwasan/hwasan_linux.cpp | 2 ++ compiler-rt/lib/lsan/lsan_posix.cpp | 2 ++ compiler-rt/lib/msan/msan_linux.cpp | 2 ++ compiler-rt/lib/tsan/rtl/tsan_rtl.cpp | 2 ++ 6 files changed, 12 insertions(+) diff --git a/compiler-rt/lib/asan/asan_posix.cpp b/compiler-rt/lib/asan/asan_posix.cpp index 4ee8d7d399e95c..39685696a0d0dd 100644 --- a/compiler-rt/lib/asan/asan_posix.cpp +++ b/compiler-rt/lib/asan/asan_posix.cpp @@ -149,6 +149,7 @@ void PlatformTSDDtor(void *tsd) { # endif static void BeforeFork() { + VReport(2, "BeforeFork tid: %llu\n", GetTid()); if (CAN_SANITIZE_LEAKS) { __lsan::LockGlobal(); } @@ -168,6 +169,7 @@ static void AfterFork(bool fork_child) { if (CAN_SANITIZE_LEAKS) { __lsan::UnlockGlobal(); } + VReport(2, "AfterFork tid: %llu\n", GetTid()); } void InstallAtForkHandler() { diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cpp b/compiler-rt/lib/dfsan/dfsan_custom.cpp index 03147a79ed6543..dbc00d7ac3ea39 100644 --- a/compiler-rt/lib/dfsan/dfsan_custom.cpp +++ b/compiler-rt/lib/dfsan/dfsan_custom.cpp @@ -2859,6 +2859,7 @@ WRAPPER_ALIAS(__isoc99_sscanf, sscanf) WRAPPER_ALIAS(__isoc23_sscanf, sscanf) static void BeforeFork() { + VReport(2, "BeforeFork tid: %llu\n", GetTid()); StackDepotLockBeforeFork(); ChainedOriginDepotLockBeforeFork(); } @@ -2866,6 +2867,7 @@ static void BeforeFork() { static void AfterFork(bool fork_child) { ChainedOriginDepotUnlockAfterFork(fork_child); StackDepotUnlockAfterFork(fork_child); + VReport(2, "AfterFork tid: %llu\n", GetTid()); } SANITIZER_INTERFACE_ATTRIBUTE diff --git a/compiler-rt/lib/hwasan/hwasan_linux.cpp b/compiler-rt/lib/hwasan/hwasan_linux.cpp index d174fb882ca483..68651d3d39d03e 100644 --- a/compiler-rt/lib/hwasan/hwasan_linux.cpp +++ b/compiler-rt/lib/hwasan/hwasan_linux.cpp @@ -528,6 +528,7 @@ uptr TagMemoryAligned(uptr p, uptr size, tag_t tag) { } static void BeforeFork() { + VReport(2, "BeforeFork tid: 
%llu\n", GetTid()); if (CAN_SANITIZE_LEAKS) { __lsan::LockGlobal(); } @@ -547,6 +548,7 @@ static void AfterFork(bool fork_child) { if (CAN_SANITIZE_LEAKS) { __lsan::UnlockGlobal(); } + VReport(2, "AfterFork tid: %llu\n", GetTid()); } void HwasanInstallAtForkHandler() { diff --git a/compiler-rt/lib/lsan/lsan_posix.cpp b/compiler-rt/lib/lsan/lsan_posix.cpp index ddd9fee07e89d2..593000b9eef991 100644 --- a/compiler-rt/lib/lsan/lsan_posix.cpp +++ b/compiler-rt/lib/lsan/lsan_posix.cpp @@ -97,6 +97,7 @@ void InstallAtExitCheckLeaks() { } static void BeforeFork() { + VReport(2, "BeforeFork tid: %llu\n", GetTid()); LockGlobal(); LockThreads(); LockAllocator(); @@ -108,6 +109,7 @@ static void AfterFork(bool fork_child) { UnlockAllocator(); UnlockThreads(); UnlockGlobal(); + VReport(2, "AfterFork tid: %llu\n", GetTid()); } void InstallAtForkHandler() { diff --git a/compiler-rt/lib/msan/msan_linux.cpp b/compiler-rt/lib/msan/msan_linux.cpp index 894cf17002bbc0..7140de7e9c5432 100644 --- a/compiler-rt/lib/msan/msan_linux.cpp +++ b/compiler-rt/lib/msan/msan_linux.cpp @@ -302,6 +302,7 @@ void MsanTSDDtor(void *tsd) { # endif static void BeforeFork() { + VReport(2, "BeforeFork tid: %llu\n", GetTid()); // Usually we lock ThreadRegistry, but msan does not have one. LockAllocator(); StackDepotLockBeforeFork(); @@ -313,6 +314,7 @@ static void AfterFork(bool fork_child) { StackDepotUnlockAfterFork(fork_child); UnlockAllocator(); // Usually we unlock ThreadRegistry, but msan does not have one. 
+ VReport(2, "AfterFork tid: %llu\n", GetTid()); } void InstallAtForkHandler() { diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp index bf29aa316f6809..5a2d39cd30607f 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp @@ -806,6 +806,7 @@ int Finalize(ThreadState *thr) { #if !SANITIZER_GO void ForkBefore(ThreadState* thr, uptr pc) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + VReport(2, "BeforeFork tid: %llu\n", GetTid()); GlobalProcessorLock(); // Detaching from the slot makes OnUserFree skip writing to the shadow. // The slot will be locked so any attempts to use it will deadlock anyway. @@ -847,6 +848,7 @@ static void ForkAfter(ThreadState* thr, SlotAttachAndLock(thr); SlotUnlock(thr); GlobalProcessorUnlock(); + VReport(2, "AfterFork tid: %llu\n", GetTid()); } void ForkParentAfter(ThreadState* thr, uptr pc) { ForkAfter(thr, false); } From 86f78c0093100016bcb0299d1b7828c2d30e3a56 Mon Sep 17 00:00:00 2001 From: Alexis Perry-Holby Date: Thu, 10 Oct 2024 14:21:21 -0600 Subject: [PATCH 078/177] [flang] Add a link to the ICS file for the Biweekly Flang Community Call --- flang/docs/GettingInvolved.md | 1 + 1 file changed, 1 insertion(+) diff --git a/flang/docs/GettingInvolved.md b/flang/docs/GettingInvolved.md index f583d934ff2bfb..a8bd93517709dd 100644 --- a/flang/docs/GettingInvolved.md +++ b/flang/docs/GettingInvolved.md @@ -49,6 +49,7 @@ To understand the status of various developments in Flang please join the respec - If you prefer to join using a meeting number and password, those can be found in this [Google Doc](https://docs.google.com/document/d/1Z2U5UAtJ-Dag5wlMaLaW1KRmNgENNAYynJqLW2j2AZQ/). Alternative methods of joining, such as call-in numbers, are also available. - Time: Wednesdays, 8:30 a.m. Pacific Time, on the weeks alternating with regular Flang Community Technical Biweekly Call. 
+- Calendar invite: https://drive.google.com/file/d/1rkfWCtIvQFcxN0Uz8YVwQGoX_BbzT8oc/view?usp=drive_link - Meeting minutes are available in this [Google Doc](https://docs.google.com/document/d/1Z2U5UAtJ-Dag5wlMaLaW1KRmNgENNAYynJqLW2j2AZQ/edit). - Minutes from older meetings were posted on the [Flang forum](https://discourse.llvm.org/c/subprojects/flang). Search for `Flang Biweekly Sync - Notes`. From b77fdf5799be6b29869f2f7969851709e03938ba Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 10 Oct 2024 13:22:56 -0700 Subject: [PATCH 079/177] [lldb] SetErrorStringWithFormatv -> FromErrorStringWithFormatv (NFC) --- lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 116c43343c01d1..367fce442bb866 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -407,8 +407,9 @@ Status ProcessKDP::DoResume(RunDirection direction) { Log *log = GetLog(KDPLog::Process); if (direction == RunDirection::eRunReverse) { - error.SetErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); + error.FromErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", + GetPluginName()); return error; } From 69c0067927293bff1401a9a050081e83dbefd282 Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 10 Oct 2024 13:25:03 -0700 Subject: [PATCH 080/177] [SandboxVec][DAG] Refactoring: Outline code that looks for mem nodes (#111750) --- .../SandboxVectorizer/DependencyGraph.h | 8 ++++ .../SandboxVectorizer/DependencyGraph.cpp | 42 ++++++++++++++----- .../SandboxVectorizer/DependencyGraphTest.cpp | 14 +++++++ 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h 
b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index da50e5326ea069..7d300ea2b60d2d 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -231,6 +231,14 @@ class MemDGNode final : public DGNode { /// Convenience builders for a MemDGNode interval. class MemDGNodeIntervalBuilder { public: + /// Scans the instruction chain in \p Intvl top-down, returning the top-most + /// MemDGNode, or nullptr. + static MemDGNode *getTopMemDGNode(const Interval &Intvl, + const DependencyGraph &DAG); + /// Scans the instruction chain in \p Intvl bottom-up, returning the + /// bottom-most MemDGNode, or nullptr. + static MemDGNode *getBotMemDGNode(const Interval &Intvl, + const DependencyGraph &DAG); /// Given \p Instrs it finds their closest mem nodes in the interval and /// returns the corresponding mem range. Note: BotN (or its neighboring mem /// node) is included in the range. diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 70843812ff65bc..0cd2240e7ff1b3 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -78,23 +78,43 @@ void MemDGNode::print(raw_ostream &OS, bool PrintDeps) const { } #endif // NDEBUG +MemDGNode * +MemDGNodeIntervalBuilder::getTopMemDGNode(const Interval &Intvl, + const DependencyGraph &DAG) { + Instruction *I = Intvl.top(); + Instruction *BeforeI = Intvl.bottom(); + // Walk down the chain looking for a mem-dep candidate instruction. 
+ while (!DGNode::isMemDepNodeCandidate(I) && I != BeforeI) + I = I->getNextNode(); + if (!DGNode::isMemDepNodeCandidate(I)) + return nullptr; + return cast(DAG.getNode(I)); +} + +MemDGNode * +MemDGNodeIntervalBuilder::getBotMemDGNode(const Interval &Intvl, + const DependencyGraph &DAG) { + Instruction *I = Intvl.bottom(); + Instruction *AfterI = Intvl.top(); + // Walk up the chain looking for a mem-dep candidate instruction. + while (!DGNode::isMemDepNodeCandidate(I) && I != AfterI) + I = I->getPrevNode(); + if (!DGNode::isMemDepNodeCandidate(I)) + return nullptr; + return cast(DAG.getNode(I)); +} + Interval MemDGNodeIntervalBuilder::make(const Interval &Instrs, DependencyGraph &DAG) { - // If top or bottom instructions are not mem-dep candidate nodes we need to - // walk down/up the chain and find the mem-dep ones. - Instruction *MemTopI = Instrs.top(); - Instruction *MemBotI = Instrs.bottom(); - while (!DGNode::isMemDepNodeCandidate(MemTopI) && MemTopI != MemBotI) - MemTopI = MemTopI->getNextNode(); - while (!DGNode::isMemDepNodeCandidate(MemBotI) && MemBotI != MemTopI) - MemBotI = MemBotI->getPrevNode(); + auto *TopMemN = getTopMemDGNode(Instrs, DAG); // If we couldn't find a mem node in range TopN - BotN then it's empty. - if (!DGNode::isMemDepNodeCandidate(MemTopI)) + if (TopMemN == nullptr) return {}; + auto *BotMemN = getBotMemDGNode(Instrs, DAG); + assert(BotMemN != nullptr && "TopMemN should be null too!"); // Now that we have the mem-dep nodes, create and return the range. 
- return Interval(cast(DAG.getNode(MemTopI)), - cast(DAG.getNode(MemBotI))); + return Interval(TopMemN, BotMemN); } DependencyGraph::DependencyType diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp index 5a9c9815ca42fa..7e2be25fa25ae6 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp @@ -347,6 +347,20 @@ define void @foo(ptr %ptr, i8 %v0, i8 %v1) { auto *S0N = cast(DAG.getNode(S0)); auto *S1N = cast(DAG.getNode(S1)); + // Check getTopMemDGNode(). + using B = sandboxir::MemDGNodeIntervalBuilder; + using InstrInterval = sandboxir::Interval; + EXPECT_EQ(B::getTopMemDGNode(InstrInterval(S0, S0), DAG), S0N); + EXPECT_EQ(B::getTopMemDGNode(InstrInterval(S0, Ret), DAG), S0N); + EXPECT_EQ(B::getTopMemDGNode(InstrInterval(Add0, Add1), DAG), S0N); + EXPECT_EQ(B::getTopMemDGNode(InstrInterval(Add0, Add0), DAG), nullptr); + + // Check getBotMemDGNode(). + EXPECT_EQ(B::getBotMemDGNode(InstrInterval(S1, S1), DAG), S1N); + EXPECT_EQ(B::getBotMemDGNode(InstrInterval(Add0, S1), DAG), S1N); + EXPECT_EQ(B::getBotMemDGNode(InstrInterval(Add0, Ret), DAG), S1N); + EXPECT_EQ(B::getBotMemDGNode(InstrInterval(Ret, Ret), DAG), nullptr); + // Check empty range. EXPECT_THAT(sandboxir::MemDGNodeIntervalBuilder::makeEmpty(), testing::ElementsAre()); From 195486950fa64938e62f6d85d31222fa41d0ee09 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 13:25:46 -0700 Subject: [PATCH 081/177] [NFC][sanitizer] Fix at_scope_exit name. 
--- compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index 70fd9405e5454f..e5d6d0a6e71649 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1037,7 +1037,7 @@ ThreadLister::Result ThreadLister::ListThreads( Report("Can't open %s for reading.\n", task_path_.data()); return Error; } - auto acts_cleanup = at_scope_exit([&] { internal_close(descriptor); }); + auto cleanup = at_scope_exit([&] { internal_close(descriptor); }); threads->clear(); Result result = Ok; From 4b5018d2311596778cade4db5177e2ab879cc218 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 10 Oct 2024 13:40:22 -0700 Subject: [PATCH 082/177] [SLP]Track repeated reduced value as it might be vectorized Need to track changes with the repeated reduced value, since it might be vectorized in the next attempt for reduction vectorization, to correctly generate the code and avoid compiler crash. 
Fixes #111887 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 22 ++++++----- .../reduced-value-repeated-and-vectorized.ll | 37 +++++++++++++++++++ 2 files changed, 49 insertions(+), 10 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 94de520a2715ff..e2958c49b8ca9f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1293,8 +1293,7 @@ class BoUpSLP { using InstrList = SmallVector; using ValueSet = SmallPtrSet; using StoreList = SmallVector; - using ExtraValueToDebugLocsMap = - MapVector>; + using ExtraValueToDebugLocsMap = SmallDenseSet; using OrdersType = SmallVector; BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, @@ -6322,7 +6321,7 @@ void BoUpSLP::buildExternalUses( continue; // Check if the scalar is externally used as an extra arg. - const auto *ExtI = ExternallyUsedValues.find(Scalar); + const auto ExtI = ExternallyUsedValues.find(Scalar); if (ExtI != ExternallyUsedValues.end()) { int FoundLane = Entry->findLaneForValue(Scalar); LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " @@ -18820,7 +18819,7 @@ class HorizontalReduction { // List of the values that were reduced in other trees as part of gather // nodes and thus requiring extract if fully vectorized in other trees. SmallPtrSet RequiredExtract; - Value *VectorizedTree = nullptr; + WeakTrackingVH VectorizedTree = nullptr; bool CheckForReusedReductionOps = false; // Try to vectorize elements based on their type. 
SmallVector States; @@ -18916,6 +18915,7 @@ class HorizontalReduction { bool SameScaleFactor = false; bool OptReusedScalars = IsSupportedHorRdxIdentityOp && SameValuesCounter.size() != Candidates.size(); + BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; if (OptReusedScalars) { SameScaleFactor = (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd || @@ -18936,6 +18936,7 @@ class HorizontalReduction { emitScaleForReusedOps(Candidates.front(), Builder, Cnt); VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); VectorizedVals.try_emplace(OrigV, Cnt); + ExternallyUsedValues.insert(OrigV); continue; } } @@ -19015,17 +19016,18 @@ class HorizontalReduction { V.reorderBottomToTop(/*IgnoreReorder=*/true); // Keep extracted other reduction values, if they are used in the // vectorization trees. - BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues; + BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues( + ExternallyUsedValues); // The reduction root is used as the insertion point for new // instructions, so set it as externally used to prevent it from being // deleted. - LocalExternallyUsedValues[ReductionRoot]; + LocalExternallyUsedValues.insert(ReductionRoot); for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) continue; for (Value *V : ReducedVals[Cnt]) if (isa(V)) - LocalExternallyUsedValues[TrackedVals[V]]; + LocalExternallyUsedValues.insert(TrackedVals[V]); } if (!IsSupportedHorRdxIdentityOp) { // Number of uses of the candidates in the vector of values. @@ -19054,21 +19056,21 @@ class HorizontalReduction { // Check if the scalar was vectorized as part of the vectorization // tree but not the top node. 
if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) { - LocalExternallyUsedValues[RdxVal]; + LocalExternallyUsedValues.insert(RdxVal); continue; } Value *OrigV = TrackedToOrig.at(RdxVal); unsigned NumOps = VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV); if (NumOps != ReducedValsToOps.at(OrigV).size()) - LocalExternallyUsedValues[RdxVal]; + LocalExternallyUsedValues.insert(RdxVal); } // Do not need the list of reused scalars in regular mode anymore. if (!IsSupportedHorRdxIdentityOp) SameValuesCounter.clear(); for (Value *RdxVal : VL) if (RequiredExtract.contains(RdxVal)) - LocalExternallyUsedValues[RdxVal]; + LocalExternallyUsedValues.insert(RdxVal); V.buildExternalUses(LocalExternallyUsedValues); V.computeMinimumValueSizes(); diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll new file mode 100644 index 00000000000000..d5e1a110c6277c --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reduced-value-repeated-and-vectorized.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 2 null, i64 6, <4 x i1> , i32 4) +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr null, align 2 +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i16> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP1]], i16 [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i16 @llvm.smax.i16(i16 [[TMP4]], i16 0) +; CHECK-NEXT: [[TMP6:%.*]] = tail call i16 
@llvm.smax.i16(i16 [[TMP5]], i16 0) +; CHECK-NEXT: ret void +; +entry: + %0 = load i16, ptr null, align 2 + %1 = xor i16 %0, 0 + %2 = tail call i16 @llvm.smax.i16(i16 %1, i16 %0) + %3 = tail call i16 @llvm.smax.i16(i16 0, i16 %2) + %4 = load i16, ptr getelementptr inbounds (i8, ptr null, i64 6), align 2 + %5 = xor i16 %4, 0 + %6 = tail call i16 @llvm.smax.i16(i16 %5, i16 %0) + %7 = tail call i16 @llvm.smax.i16(i16 %3, i16 %6) + %8 = load i16, ptr getelementptr (i8, ptr null, i64 12), align 2 + %9 = xor i16 %8, 0 + %10 = tail call i16 @llvm.smax.i16(i16 %9, i16 %0) + %11 = tail call i16 @llvm.smax.i16(i16 %7, i16 %10) + %12 = load i16, ptr getelementptr (i8, ptr null, i64 18), align 2 + %13 = xor i16 %12, 0 + %14 = tail call i16 @llvm.smax.i16(i16 %13, i16 %0) + %15 = tail call i16 @llvm.smax.i16(i16 %11, i16 %14) + %16 = tail call i16 @llvm.smax.i16(i16 %15, i16 0) + ret void +} + From 16ef893e9fdec2b08dafc82f5450b41834e09039 Mon Sep 17 00:00:00 2001 From: Wael Yehia Date: Wed, 9 Oct 2024 18:06:56 +0000 Subject: [PATCH 083/177] [test] env -u is not supported on AIX, use `unset` instead --- compiler-rt/test/profile/instrprof-tmpdir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/test/profile/instrprof-tmpdir.c b/compiler-rt/test/profile/instrprof-tmpdir.c index 6f323e7e6a01a5..7206df3c2eb0c6 100644 --- a/compiler-rt/test/profile/instrprof-tmpdir.c +++ b/compiler-rt/test/profile/instrprof-tmpdir.c @@ -12,7 +12,8 @@ // RUN: llvm-profdata show ./raw2.profraw | FileCheck %s -check-prefix TMPDIR // // Check that we fall back to the default path if TMPDIR is missing. 
-// RUN: env -u TMPDIR LLVM_PROFILE_FILE="%%t/raw3.profraw" %run %t/binary 2>&1 | FileCheck %s -check-prefix MISSING +// RUN: %if system-aix %{ unset TMPDIR %} +// RUN: env %if !system-aix %{ -u TMPDIR %} LLVM_PROFILE_FILE="%%t/raw3.profraw" %run %t/binary 2>&1 | FileCheck %s -check-prefix MISSING // RUN: llvm-profdata show ./default.profraw | FileCheck %s -check-prefix TMPDIR // TMPDIR: Maximum function count: 1 From c99b36554745837c549e1b46cd60db70588affcf Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 10 Oct 2024 14:50:34 -0700 Subject: [PATCH 084/177] Revert "[lldb] Add include for SBLanguages in lldb-enumerations (#111907)" Temporarily Revert until Chelsea can look at this. With a clean build, SBLanguages.h won't be generated in the build directory at the point when it is included by lldb-enumerations when compiling e.g. Broadcaster.cpp. On a clean build (no pre-existing build directory), the dependency ordering is not explicitly stated so the build will fail. An incremental build will succeed. This reverts commit b3554265f24aa570bbc8693af8420a306c459f94. --- lldb/include/lldb/lldb-enumerations.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 217cd7f65cc1c4..232d1dfdb5c9d0 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -12,8 +12,6 @@ #include #include -#include - #ifndef SWIG // Macro to enable bitmask operations on an enum. Without this, Enum | Enum // gets promoted to an int, so you have to say Enum a = Enum(eFoo | eBar). 
If From 5deadc6eaede3d32ccdd68529f371092d4d218da Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 14:52:50 -0700 Subject: [PATCH 085/177] [NFC][sanitizer] Extract `LoadStatus` (#111909) For #111901 --- .../lib/sanitizer_common/sanitizer_linux.cpp | 21 +++++++++++++------ .../lib/sanitizer_common/sanitizer_linux.h | 1 + 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index e5d6d0a6e71649..a4e58133c79f08 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1042,8 +1042,6 @@ ThreadLister::Result ThreadLister::ListThreads( Result result = Ok; for (bool first_read = true;; first_read = false) { - // Resize to max capacity if it was downsized by IsAlive. - buffer_.resize(buffer_.capacity()); CHECK_GE(buffer_.size(), 4096); uptr read = internal_getdents( descriptor, (struct linux_dirent *)buffer_.data(), buffer_.size()); @@ -1088,14 +1086,25 @@ ThreadLister::Result ThreadLister::ListThreads( } } +const char *ThreadLister::LoadStatus(int tid) { + auto cleanup = at_scope_exit([&] { + // Resize back to capacity if it is downsized by `ReadFileToVector`. + buffer_.resize(buffer_.capacity()); + }); + if (!ReadFileToVector(status_path_.data(), &buffer_) || buffer_.empty()) + return nullptr; + buffer_.push_back('\0'); + return buffer_.data(); +} + bool ThreadLister::IsAlive(int tid) { // /proc/%d/task/%d/status uses same call to detect alive threads as // proc_task_readdir. See task_state implementation in Linux. 
- if (!ReadFileToVector(status_path_.data(), &buffer_) || buffer_.empty()) - return false; - buffer_.push_back(0); static const char kPrefix[] = "\nPPid:"; - const char *field = internal_strstr(buffer_.data(), kPrefix); + const char *status = LoadStatus(tid); + if (!status) + return false; + const char *field = internal_strstr(status, kPrefix); if (!field) return false; field += internal_strlen(kPrefix); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h index 96c617822b5b27..07d9528813b3fe 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h @@ -103,6 +103,7 @@ class ThreadLister { Ok, }; Result ListThreads(InternalMmapVector *threads); + const char *LoadStatus(int tid); private: bool IsAlive(int tid); From af7fa2710c998811dd72799799798f2bd4d9bff4 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 14:53:16 -0700 Subject: [PATCH 086/177] [sanitizer] VReport thread status for failed PTRACE_ATTACH (#111901) Such threads can cause false leak reports, but often it's hard to diagnose the reason of failed PTRACE_ATTACH. 
Maybe we can find a clue from `/proc/*/task/*/status` --- .../sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp index d9f803a276dadc..6ebca965f6a334 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp @@ -228,6 +228,8 @@ bool ThreadSuspender::SuspendAllThreads() { for (tid_t tid : threads) { if (SuspendThread(tid)) retry = true; + else + VReport(2, "%llu/status: %s\n", tid, thread_lister.LoadStatus(tid)); } if (retry) VReport(1, "SuspendAllThreads retry: %d\n", i); From 48545a955c4e61f42833af7417032d816482bdfc Mon Sep 17 00:00:00 2001 From: William Junda Huang Date: Thu, 10 Oct 2024 17:59:44 -0400 Subject: [PATCH 087/177] [ThinLTO] Do not duplicate import a function that is actually defined in the current module (#110064) Doing so could cause a bug where the linker tries to remap a function "reimported" from the current module when materializing it, causing a lookup assert in the type mappings. --- llvm/lib/Linker/IRMover.cpp | 6 +- .../Inputs/ditemplatevalueparameter-remap.ll | 29 +++++++ .../X86/ditemplatevalueparameter-remap.ll | 87 +++++++++++++++++++ 3 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll create mode 100644 llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 3a6c2678cd157f..5067fbff2e277b 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -595,11 +595,15 @@ Value *IRLinker::materialize(Value *V, bool ForIndirectSymbol) { if (!SGV) return nullptr; + // If SGV is from dest, it was already materialized when dest was loaded. 
+ if (SGV->getParent() == &DstM) + return nullptr; + // When linking a global from other modules than source & dest, skip // materializing it because it would be mapped later when its containing // module is linked. Linking it now would potentially pull in many types that // may not be mapped properly. - if (SGV->getParent() != &DstM && SGV->getParent() != SrcM.get()) + if (SGV->getParent() != SrcM.get()) return nullptr; Expected NewProto = linkGlobalValueProto(SGV, ForIndirectSymbol); diff --git a/llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll b/llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll new file mode 100644 index 00000000000000..be93160b943397 --- /dev/null +++ b/llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll @@ -0,0 +1,29 @@ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @_Z8thinlto1v() unnamed_addr { + %3 = alloca i64, align 4 + #dbg_declare(ptr %3, !14, !DIExpression(), !15) + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "B.cpp", directory: ".") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!10 = distinct !DISubprogram(name: "thinlto1", linkageName: "_Z8thinlto1v", scope: !11, file: !11, line: 8, type: !12, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!11 = !DIFile(filename: "b.cpp", directory: ".") +!12 = !DISubroutineType(types: !13) +!13 = !{null} +!14 = !DILocalVariable(name: "a", arg: 1, scope: !10, file: !11, line: 18, type: !16) +!15 = 
!DILocation(line: 18, column: 19, scope: !10) +!16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "S<&func1>", file: !11, line: 2, size: 8, flags: DIFlagTypePassByValue, elements: !17, templateParams: !18, identifier: "_ZTS1SIXadL_Z5func1vEEE") +!17 = !{} +!18 = !{!19} +!19 = !DITemplateValueParameter(name: "Func", type: !20, value: ptr undef) +!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64) diff --git a/llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll b/llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll new file mode 100644 index 00000000000000..0651705ccba8b8 --- /dev/null +++ b/llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll @@ -0,0 +1,87 @@ +; https://github.com/llvm/llvm-project/pull/110064 +; This test case checks if thinLTO correctly links metadata values in a specific +; situation. Assume we are linking module B into module A, where an extern +; function used in A is defined in B, but the function body has a +; DITemplateValueParameter referring to another function back in A. The +; compiler must check this other function is actually coming from A, thus +; already materialized and does not require remapping. The IR here is modified +; from the following source code. +; +; // A.h +; template +; struct S { +; void Impl() { +; Func(); +; } +; }; +; +; void func1(); +; +; // A.cpp +; #include "A.h" +; __attribute__((weak)) void func1() {} +; extern void thinlto1(); +; void bar() { +; S s; // Force instantiation of S in this compilation unit. 
+; s.Impl(); +; thinlto1(); +; } +; +; // B.cpp +; #include "A.h" +; void thinlto1() { +; S s; +; } +; +; RUN: opt -module-summary -o %t1.bc %s +; RUN: opt -module-summary -o %t2.bc %S/Inputs/ditemplatevalueparameter-remap.ll +; RUN: ld.lld --plugin-opt=thinlto-index-only -shared %t1.bc %t2.bc +; RUN: clang -O3 -fthinlto-index=%t1.bc.thinlto.bc -x ir %t1.bc -S -emit-llvm -o - | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +$_Z5func1v = comdat any + +define linkonce_odr dso_local void @_Z5func1v() unnamed_addr !dbg !10 { + ret void +} + +; Dummy function to use _Z5func1v so that it is not treated as dead symbol. +define void @_Z3bazv() { + tail call void @_Z5func1v() + ret void +} + +declare void @_Z8thinlto1v() unnamed_addr + +; CHECK: void @_Z3barv() +; CHECK-NOT: call void @_Z8thinlto1v() +; CHECK-NEXT: ret void +define void @_Z3barv() unnamed_addr !dbg !14 { + tail call void @_Z8thinlto1v(), !dbg !25 + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "A.cpp", directory: ".") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!10 = distinct !DISubprogram(name: "func1", linkageName: "_Z5func1v", scope: !11, file: !11, line: 6, type: !12, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!11 = !DIFile(filename: "a.h", directory: ".") +!12 = !DISubroutineType(types: !13) +!13 = !{null} +!14 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !11, file: !11, line: 15, type: !12, scopeLine: 15, flags: 
DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !16) +!16 = !{!17} +!17 = !DILocalVariable(name: "s", scope: !14, file: !11, line: 10, type: !18) +!18 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "S<&func1>", file: !11, line: 2, size: 8, flags: DIFlagTypePassByValue, elements: !19, templateParams: !20, identifier: "_ZTS1SIXadL_Z5func1vEEE") +!19 = !{} +!20 = !{!21} +!21 = !DITemplateValueParameter(name: "Func", type: !22, value: ptr @_Z5func1v) +!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64) +!25 = !DILocation(line: 16, column: 5, scope: !14) From f02252e1fd2965db007cf7be74c448b7a119c321 Mon Sep 17 00:00:00 2001 From: Augusto Noronha Date: Thu, 10 Oct 2024 15:01:13 -0700 Subject: [PATCH 088/177] Revert "[lldb] SetErrorStringWithFormatv -> FromErrorStringWithFormatv (NFC)" This reverts commit b77fdf5799be6b29869f2f7969851709e03938ba. --- lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 367fce442bb866..116c43343c01d1 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -407,9 +407,8 @@ Status ProcessKDP::DoResume(RunDirection direction) { Log *log = GetLog(KDPLog::Process); if (direction == RunDirection::eRunReverse) { - error.FromErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", - GetPluginName()); + error.SetErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); return error; } From 2ff4c25b7efff64b3b662d0bedcfe7edebcf20b9 Mon Sep 17 00:00:00 2001 From: Augusto Noronha Date: Thu, 10 Oct 2024 15:01:20 -0700 Subject: [PATCH 089/177] Revert "[lldb] Implement basic support for reverse-continue 
(#99736)" This reverts commit d5e1de6da96c1ab3b8cae68447e8ed3696a7006e. --- lldb/include/lldb/API/SBProcess.h | 1 - lldb/include/lldb/Target/Process.h | 21 +- lldb/include/lldb/Target/StopInfo.h | 6 - lldb/include/lldb/lldb-enumerations.h | 6 - .../Python/lldbsuite/test/gdbclientutils.py | 5 +- .../Python/lldbsuite/test/lldbgdbproxy.py | 175 -------- .../Python/lldbsuite/test/lldbreverse.py | 418 ------------------ .../Python/lldbsuite/test/lldbtest.py | 2 - lldb/source/API/SBProcess.cpp | 8 +- lldb/source/API/SBThread.cpp | 2 - .../source/Interpreter/CommandInterpreter.cpp | 3 +- .../Process/Linux/NativeThreadLinux.cpp | 3 - .../Process/MacOSX-Kernel/ProcessKDP.cpp | 9 +- .../Process/MacOSX-Kernel/ProcessKDP.h | 2 +- .../Process/Windows/Common/ProcessWindows.cpp | 8 +- .../Process/Windows/Common/ProcessWindows.h | 2 +- .../GDBRemoteCommunicationClient.cpp | 22 - .../gdb-remote/GDBRemoteCommunicationClient.h | 6 - .../GDBRemoteCommunicationServerLLGS.cpp | 1 - .../Process/gdb-remote/ProcessGDBRemote.cpp | 77 +--- .../Process/gdb-remote/ProcessGDBRemote.h | 2 +- .../Process/scripted/ScriptedProcess.cpp | 9 +- .../Process/scripted/ScriptedProcess.h | 2 +- lldb/source/Target/Process.cpp | 29 +- lldb/source/Target/StopInfo.cpp | 29 -- lldb/source/Target/Thread.cpp | 8 +- .../reverse-execution/Makefile | 3 - .../TestReverseContinueBreakpoints.py | 115 ----- .../TestReverseContinueNotSupported.py | 30 -- .../functionalities/reverse-execution/main.c | 14 - lldb/tools/lldb-dap/JSONUtils.cpp | 3 - lldb/tools/lldb-dap/LLDBUtils.cpp | 1 - 32 files changed, 44 insertions(+), 978 deletions(-) delete mode 100644 lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py delete mode 100644 lldb/packages/Python/lldbsuite/test/lldbreverse.py delete mode 100644 lldb/test/API/functionalities/reverse-execution/Makefile delete mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py delete mode 100644 
lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py delete mode 100644 lldb/test/API/functionalities/reverse-execution/main.c diff --git a/lldb/include/lldb/API/SBProcess.h b/lldb/include/lldb/API/SBProcess.h index 8b8ed830b54cc0..1624e02070b1b2 100644 --- a/lldb/include/lldb/API/SBProcess.h +++ b/lldb/include/lldb/API/SBProcess.h @@ -159,7 +159,6 @@ class LLDB_API SBProcess { lldb::SBError Destroy(); lldb::SBError Continue(); - lldb::SBError Continue(RunDirection direction); lldb::SBError Stop(); diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index fe7fbc50fd5770..b8c53a474ba6b9 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -857,10 +857,10 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - Status Resume(lldb::RunDirection direction = lldb::eRunForward); + Status Resume(); /// Resume a process, and wait for it to stop. - Status ResumeSynchronous(Stream *stream, lldb::RunDirection direction = lldb::eRunForward); + Status ResumeSynchronous(Stream *stream); /// Halts a running process. /// @@ -1104,14 +1104,9 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - virtual Status DoResume(lldb::RunDirection direction) { - if (direction == lldb::RunDirection::eRunForward) { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support resuming processes", GetPluginName()); - } else { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - } + virtual Status DoResume() { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support resuming processes", GetPluginName()); } /// Called after resuming a process. 
@@ -2337,8 +2332,6 @@ class Process : public std::enable_shared_from_this, bool IsRunning() const; - lldb::RunDirection GetLastRunDirection() { return m_last_run_direction; } - DynamicCheckerFunctions *GetDynamicCheckers() { return m_dynamic_checkers_up.get(); } @@ -2858,7 +2851,7 @@ void PruneThreadPlans(); /// /// \return /// An Status object describing the success or failure of the resume. - Status PrivateResume(lldb::RunDirection direction = lldb::eRunForward); + Status PrivateResume(); // Called internally void CompleteAttach(); @@ -3134,8 +3127,6 @@ void PruneThreadPlans(); // m_currently_handling_do_on_removals are true, // Resume will only request a resume, using this // flag to check. - // The direction of execution from the last time this process was resumed. - lldb::RunDirection m_last_run_direction; lldb::tid_t m_interrupt_tid; /// The tid of the thread that issued the async /// interrupt, used by thread plan timeout. It diff --git a/lldb/include/lldb/Target/StopInfo.h b/lldb/include/lldb/Target/StopInfo.h index 072f71f6b1122f..fae90364deaf0a 100644 --- a/lldb/include/lldb/Target/StopInfo.h +++ b/lldb/include/lldb/Target/StopInfo.h @@ -142,12 +142,6 @@ class StopInfo : public std::enable_shared_from_this { static lldb::StopInfoSP CreateStopReasonProcessorTrace(Thread &thread, const char *description); - // This creates a StopInfo indicating that execution stopped because - // it was replaying some recorded execution history, and execution reached - // the end of that recorded history. 
- static lldb::StopInfoSP - CreateStopReasonHistoryBoundary(Thread &thread, const char *description); - static lldb::StopInfoSP CreateStopReasonFork(Thread &thread, lldb::pid_t child_pid, lldb::tid_t child_tid); diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 232d1dfdb5c9d0..938f6e3abe8f2a 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -135,9 +135,6 @@ FLAGS_ENUM(LaunchFlags){ /// Thread Run Modes. enum RunMode { eOnlyThisThread, eAllThreads, eOnlyDuringStepping }; -/// Execution directions -enum RunDirection { eRunForward, eRunReverse }; - /// Byte ordering definitions. enum ByteOrder { eByteOrderInvalid = 0, @@ -257,9 +254,6 @@ enum StopReason { eStopReasonVFork, eStopReasonVForkDone, eStopReasonInterrupt, ///< Thread requested interrupt - // Indicates that execution stopped because the debugger backend relies - // on recorded data and we reached the end of that data. - eStopReasonHistoryBoundary, }; /// Command Return Status Types. 
diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py index 732d6171320680..1784487323ad6b 100644 --- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py +++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py @@ -510,9 +510,8 @@ def start(self): self._thread.start() def stop(self): - if self._thread is not None: - self._thread.join() - self._thread = None + self._thread.join() + self._thread = None def get_connect_address(self): return self._socket.get_connect_address() diff --git a/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py b/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py deleted file mode 100644 index 2a9592bf4545a4..00000000000000 --- a/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py +++ /dev/null @@ -1,175 +0,0 @@ -import logging -import os -import os.path -import random - -import lldb -from lldbsuite.test.lldbtest import * -from lldbsuite.test.gdbclientutils import * -import lldbgdbserverutils -from lldbsuite.support import seven - - -class GDBProxyTestBase(TestBase): - """ - Base class for gdbserver proxy tests. - - This class will setup and start a mock GDB server for the test to use. - It pases through requests to a regular lldb-server/debugserver and - forwards replies back to the LLDB under test. 
- """ - - """The gdbserver that we implement.""" - server = None - """The inner lldb-server/debugserver process that we proxy requests into.""" - monitor_server = None - monitor_sock = None - - server_socket_class = TCPServerSocket - - DEFAULT_TIMEOUT = 20 * (10 if ("ASAN_OPTIONS" in os.environ) else 1) - - _verbose_log_handler = None - _log_formatter = logging.Formatter(fmt="%(asctime)-15s %(levelname)-8s %(message)s") - - def setUpBaseLogging(self): - self.logger = logging.getLogger(__name__) - - if len(self.logger.handlers) > 0: - return # We have set up this handler already - - self.logger.propagate = False - self.logger.setLevel(logging.DEBUG) - - # log all warnings to stderr - handler = logging.StreamHandler() - handler.setLevel(logging.WARNING) - handler.setFormatter(self._log_formatter) - self.logger.addHandler(handler) - - def setUp(self): - TestBase.setUp(self) - - self.setUpBaseLogging() - - if self.isVerboseLoggingRequested(): - # If requested, full logs go to a log file - log_file_name = self.getLogBasenameForCurrentTest() + "-proxy.log" - self._verbose_log_handler = logging.FileHandler( - log_file_name - ) - self._verbose_log_handler.setFormatter(self._log_formatter) - self._verbose_log_handler.setLevel(logging.DEBUG) - self.logger.addHandler(self._verbose_log_handler) - - lldb_server_exe = lldbgdbserverutils.get_lldb_server_exe() - if lldb_server_exe is None: - self.debug_monitor_exe = lldbgdbserverutils.get_debugserver_exe() - self.assertTrue(self.debug_monitor_exe is not None) - self.debug_monitor_extra_args = [] - else: - self.debug_monitor_exe = lldb_server_exe - self.debug_monitor_extra_args = ["gdbserver"] - - self.server = MockGDBServer(self.server_socket_class()) - self.server.responder = self - - def tearDown(self): - # TestBase.tearDown will kill the process, but we need to kill it early - # so its client connection closes and we can stop the server before - # finally calling the base tearDown. 
- if self.process() is not None: - self.process().Kill() - self.server.stop() - - self.logger.removeHandler(self._verbose_log_handler) - self._verbose_log_handler = None - - TestBase.tearDown(self) - - def isVerboseLoggingRequested(self): - # We will report our detailed logs if the user requested that the "gdb-remote" channel is - # logged. - return any(("gdb-remote" in channel) for channel in lldbtest_config.channels) - - def connect(self, target): - """ - Create a process by connecting to the mock GDB server. - """ - self.prep_debug_monitor_and_inferior() - self.server.start() - - listener = self.dbg.GetListener() - error = lldb.SBError() - process = target.ConnectRemote( - listener, self.server.get_connect_url(), "gdb-remote", error - ) - self.assertTrue(error.Success(), error.description) - self.assertTrue(process, PROCESS_IS_VALID) - return process - - def get_next_port(self): - return 12000 + random.randint(0, 3999) - - def prep_debug_monitor_and_inferior(self): - inferior_exe_path = self.getBuildArtifact("a.out") - self.connect_to_debug_monitor([inferior_exe_path]) - self.assertIsNotNone(self.monitor_server) - self.initial_handshake() - - def initial_handshake(self): - self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "+") - self.monitor_server.send_packet(seven.bitcast_to_bytes("QStartNoAckMode")) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "+") - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "OK") - self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "+") - - def get_debug_monitor_command_line_args(self, connect_address, launch_args): - return self.debug_monitor_extra_args + ["--reverse-connect", connect_address] + launch_args - - 
def launch_debug_monitor(self, launch_args): - family, type, proto, _, addr = socket.getaddrinfo( - "localhost", 0, proto=socket.IPPROTO_TCP - )[0] - sock = socket.socket(family, type, proto) - sock.settimeout(self.DEFAULT_TIMEOUT) - sock.bind(addr) - sock.listen(1) - addr = sock.getsockname() - connect_address = "[{}]:{}".format(*addr) - - commandline_args = self.get_debug_monitor_command_line_args( - connect_address, launch_args - ) - - # Start the server. - self.logger.info(f"Spawning monitor {commandline_args}") - monitor_process = self.spawnSubprocess( - self.debug_monitor_exe, commandline_args, install_remote=False - ) - self.assertIsNotNone(monitor_process) - - self.monitor_sock = sock.accept()[0] - self.monitor_sock.settimeout(self.DEFAULT_TIMEOUT) - return monitor_process - - def connect_to_debug_monitor(self, launch_args): - monitor_process = self.launch_debug_monitor(launch_args) - self.monitor_server = lldbgdbserverutils.Server(self.monitor_sock, monitor_process) - - def respond(self, packet): - """Subclasses can override this to change how packets are handled.""" - return self.pass_through(packet) - - def pass_through(self, packet): - self.logger.info(f"Sending packet {packet}") - self.monitor_server.send_packet(seven.bitcast_to_bytes(packet)) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.logger.info(f"Received reply {reply}") - return reply diff --git a/lldb/packages/Python/lldbsuite/test/lldbreverse.py b/lldb/packages/Python/lldbsuite/test/lldbreverse.py deleted file mode 100644 index 0f02fdffbdeada..00000000000000 --- a/lldb/packages/Python/lldbsuite/test/lldbreverse.py +++ /dev/null @@ -1,418 +0,0 @@ -import os -import os.path -import lldb -from lldbsuite.test.lldbtest import * -from lldbsuite.test.gdbclientutils import * -from lldbsuite.test.lldbgdbproxy import * -import lldbgdbserverutils -import re - - -class ThreadSnapshot: - def __init__(self, thread_id, registers): - self.thread_id = thread_id - 
self.registers = registers - - -class MemoryBlockSnapshot: - def __init__(self, address, data): - self.address = address - self.data = data - - -class StateSnapshot: - def __init__(self, thread_snapshots, memory): - self.thread_snapshots = thread_snapshots - self.memory = memory - self.thread_id = None - - -class RegisterInfo: - def __init__(self, lldb_index, bitsize, little_endian): - self.lldb_index = lldb_index - self.bitsize = bitsize - self.little_endian = little_endian - - -BELOW_STACK_POINTER = 16384 -ABOVE_STACK_POINTER = 4096 - -BLOCK_SIZE = 1024 - -SOFTWARE_BREAKPOINTS = 0 -HARDWARE_BREAKPOINTS = 1 -WRITE_WATCHPOINTS = 2 - - -class ReverseTestBase(GDBProxyTestBase): - """ - Base class for tests that need reverse execution. - - This class uses a gdbserver proxy to add very limited reverse- - execution capability to lldb-server/debugserver for testing - purposes only. - - To use this class, run the inferior forward until some stopping point. - Then call `start_recording()` and execute forward again until reaching - a software breakpoint; this class records the state before each execution executes. - At that point, the server will accept "bc" and "bs" packets to step - backwards through the state. - When executing during recording, we only allow single-step and continue without - delivering a signal, and only software breakpoint stops are allowed. - - We assume that while recording is enabled, the only effects of instructions - are on general-purpose registers (read/written by the 'g' and 'G' packets) - and on memory bytes between [SP - BELOW_STACK_POINTER, SP + ABOVE_STACK_POINTER). - """ - - """ - A list of StateSnapshots in time order. - - There is one snapshot per single-stepped instruction, - representing the state before that instruction was - executed. The last snapshot in the list is the - snapshot before the last instruction was executed. 
- This is an undo log; we snapshot a superset of the state that may have - been changed by the instruction's execution. - """ - snapshots = None - recording_enabled = False - - breakpoints = None - - pid = None - - pc_register_info = None - sp_register_info = None - general_purpose_register_info = None - - def __init__(self, *args, **kwargs): - GDBProxyTestBase.__init__(self, *args, **kwargs) - self.breakpoints = [set(), set(), set(), set(), set()] - - def respond(self, packet): - if not packet: - raise ValueError("Invalid empty packet") - if packet == self.server.PACKET_INTERRUPT: - # Don't send a response. We'll just run to completion. - return [] - if self.is_command(packet, "qSupported", ":"): - reply = self.pass_through(packet) - return reply + ";ReverseStep+;ReverseContinue+" - if self.is_command(packet, "vCont", ";"): - if self.recording_enabled: - return self.continue_with_recording(packet) - snapshots = [] - if packet[0] == "c" or packet[0] == "s" or packet[0] == "C" or packet[0] == "S": - raise ValueError("LLDB should not be sending old-style continuation packets") - if packet == "bc": - return self.reverse_continue() - if packet == "bs": - return self.reverse_step() - if packet == 'jThreadsInfo': - # Suppress this because it contains thread stop reasons which we might - # need to modify, and we don't want to have to implement that. - return "" - if packet[0] == "z" or packet[0] == "Z": - reply = self.pass_through(packet) - if reply == "OK": - self.update_breakpoints(packet) - return reply - return GDBProxyTestBase.respond(self, packet) - - def start_recording(self): - self.recording_enabled = True - self.snapshots = [] - - def stop_recording(self): - """ - Don't record when executing foward. - - Reverse execution is still supported until the next forward continue. 
- """ - self.recording_enabled = False - - def is_command(self, packet, cmd, follow_token): - return packet == cmd or packet[0:len(cmd) + 1] == cmd + follow_token - - def update_breakpoints(self, packet): - m = re.match("([zZ])([01234]),([0-9a-f]+),([0-9a-f]+)", packet) - if m is None: - raise ValueError("Invalid breakpoint packet: " + packet) - t = int(m.group(2)) - addr = int(m.group(3), 16) - kind = int(m.group(4), 16) - if m.group(1) == 'Z': - self.breakpoints[t].add((addr, kind)) - else: - self.breakpoints[t].discard((addr, kind)) - - def breakpoint_triggered_at(self, pc): - if any(addr == pc for addr, kind in self.breakpoints[SOFTWARE_BREAKPOINTS]): - return True - if any(addr == pc for addr, kind in self.breakpoints[HARDWARE_BREAKPOINTS]): - return True - return False - - def watchpoint_triggered(self, new_value_block, current_contents): - """Returns the address or None.""" - for watch_addr, kind in breakpoints[WRITE_WATCHPOINTS]: - for offset in range(0, kind): - addr = watch_addr + offset - if (addr >= new_value_block.address and - addr < new_value_block.address + len(new_value_block.data)): - index = addr - new_value_block.address - if new_value_block.data[index*2:(index + 1)*2] != current_contents[index*2:(index + 1)*2]: - return watch_addr - return None - - def continue_with_recording(self, packet): - self.logger.debug("Continue with recording enabled") - - step_packet = "vCont;s" - if packet == "vCont": - requested_step = False - else: - m = re.match("vCont;(c|s)(.*)", packet) - if m is None: - raise ValueError("Unsupported vCont packet: " + packet) - requested_step = m.group(1) == 's' - step_packet += m.group(2) - - while True: - snapshot = self.capture_snapshot() - reply = self.pass_through(step_packet) - (stop_signal, stop_pairs) = self.parse_stop(reply) - if stop_signal != 5: - raise ValueError("Unexpected stop signal: " + reply) - is_swbreak = False - thread_id = None - for key, value in stop_pairs.items(): - if key == "thread": - thread_id = 
self.parse_thread_id(value) - continue - if re.match('[0-9a-f]+', key): - continue - if key == "swbreak" or (key == "reason" and value == "breakpoint"): - is_swbreak = True - continue - if key in ["name", "threads", "thread-pcs", "reason"]: - continue - raise ValueError(f"Unknown stop key '{key}' in {reply}") - if is_swbreak: - self.logger.debug("Recording stopped") - return reply - if thread_id is None: - return ValueError("Expected thread ID: " + reply) - snapshot.thread_id = thread_id - self.snapshots.append(snapshot) - if requested_step: - self.logger.debug("Recording stopped for step") - return reply - - def parse_stop(self, reply): - result = {} - if not reply: - raise ValueError("Invalid empty packet") - if reply[0] == "T" and len(reply) >= 3: - result = {k:v for k, v in self.parse_pairs(reply[3:])} - return (int(reply[1:3], 16), result) - raise "Unsupported stop reply: " + reply - - def parse_pairs(self, text): - for pair in text.split(";"): - if not pair: - continue - m = re.match("([^:]+):(.*)", pair) - if m is None: - raise ValueError("Invalid pair text: " + text) - yield (m.group(1), m.group(2)) - - def capture_snapshot(self): - """Snapshot all threads and their stack memories.""" - self.ensure_register_info() - current_thread = self.get_current_thread() - thread_snapshots = [] - memory = [] - for thread_id in self.get_thread_list(): - registers = {} - for index in sorted(self.general_purpose_register_info.keys()): - reply = self.pass_through(f"p{index:x};thread:{thread_id:x};") - if reply == "" or reply[0] == 'E': - raise ValueError("Can't read register") - registers[index] = reply - thread_snapshot = ThreadSnapshot(thread_id, registers) - thread_sp = self.get_register(self.sp_register_info, thread_snapshot.registers) - memory += self.read_memory(thread_sp - BELOW_STACK_POINTER, thread_sp + ABOVE_STACK_POINTER) - thread_snapshots.append(thread_snapshot) - self.set_current_thread(current_thread) - return StateSnapshot(thread_snapshots, memory) - - def 
restore_snapshot(self, snapshot): - """ - Restore the snapshot during reverse execution. - - If this triggers a breakpoint or watchpoint, return the stop reply, - otherwise None. - """ - current_thread = self.get_current_thread() - stop_reasons = [] - for thread_snapshot in snapshot.thread_snapshots: - thread_id = thread_snapshot.thread_id - for lldb_index in sorted(thread_snapshot.registers.keys()): - data = thread_snapshot.registers[lldb_index] - reply = self.pass_through(f"P{lldb_index:x}={data};thread:{thread_id:x};") - if reply != "OK": - raise ValueError("Can't restore thread register") - if thread_id == snapshot.thread_id: - new_pc = self.get_register(self.pc_register_info, thread_snapshot.registers) - if self.breakpoint_triggered_at(new_pc): - stop_reasons.append([("reason", "breakpoint")]) - self.set_current_thread(current_thread) - for block in snapshot.memory: - current_memory = self.pass_through(f"m{block.address:x},{(len(block.data)/2):x}") - if not current_memory or current_memory[0] == 'E': - raise ValueError("Can't read back memory") - reply = self.pass_through(f"M{block.address:x},{len(block.data)/2:x}:" + block.data) - if reply != "OK": - raise ValueError("Can't restore memory") - watch_addr = self.watchpoint_triggered(block, current_memory[1:]) - if watch_addr is not None: - stop_reasons.append([("reason", "watchpoint"), ("watch", f"{watch_addr:x}")]) - if stop_reasons: - pairs = ";".join(f"{key}:{value}" for key, value in stop_reasons[0]) - return f"T05thread:{self.pid:x}.{snapshot.thread_id:x};{pairs};" - return None - - def reverse_step(self): - if not self.snapshots: - self.logger.debug("Reverse-step at history boundary") - return self.history_boundary_reply(self.get_current_thread()) - self.logger.debug("Reverse-step started") - snapshot = self.snapshots.pop() - stop_reply = self.restore_snapshot(snapshot) - self.set_current_thread(snapshot.thread_id) - self.logger.debug("Reverse-step stopped") - if stop_reply is None: - return 
self.singlestep_stop_reply(snapshot.thread_id) - return stop_reply - - def reverse_continue(self): - self.logger.debug("Reverse-continue started") - thread_id = None - while self.snapshots: - snapshot = self.snapshots.pop() - stop_reply = self.restore_snapshot(snapshot) - thread_id = snapshot.thread_id - if stop_reply is not None: - self.set_current_thread(thread_id) - self.logger.debug("Reverse-continue stopped") - return stop_reply - if thread_id is None: - thread_id = self.get_current_thread() - else: - self.set_current_thread(snapshot.thread_id) - self.logger.debug("Reverse-continue stopped at history boundary") - return self.history_boundary_reply(thread_id) - - def get_current_thread(self): - reply = self.pass_through("qC") - return self.parse_thread_id(reply[2:]) - - def parse_thread_id(self, thread_id): - m = re.match("(p([0-9a-f]+)[.])?([0-9a-f]+)$", thread_id) - if m is None: - raise ValueError("Invalid thread ID: " + thread_id) - if self.pid is None: - self.pid = int(m.group(2), 16) - return int(m.group(3), 16) - - def history_boundary_reply(self, thread_id): - return f"T00thread:{self.pid:x}.{thread_id:x};replaylog:begin;" - - def singlestep_stop_reply(self, thread_id): - return f"T05thread:{self.pid:x}.{thread_id:x};" - - def set_current_thread(self, thread_id): - """ - Set current thread in inner gdbserver. 
- """ - if thread_id >= 0: - self.pass_through(f"Hg{self.pid:x}.{thread_id:x}") - self.pass_through(f"Hc{self.pid:x}.{thread_id:x}") - else: - self.pass_through(f"Hc-1.-1") - self.pass_through(f"Hg-1.-1") - - def get_register(self, register_info, registers): - if register_info.bitsize % 8 != 0: - raise ValueError("Register size must be a multiple of 8 bits") - if register_info.lldb_index not in registers: - raise ValueError("Register value not captured") - data = registers[register_info.lldb_index] - num_bytes = register_info.bitsize//8 - bytes = [] - for i in range(0, num_bytes): - bytes.append(int(data[i*2:(i + 1)*2], 16)) - if register_info.little_endian: - bytes.reverse() - result = 0 - for byte in bytes: - result = (result << 8) + byte - return result - - def read_memory(self, start_addr, end_addr): - """ - Read a region of memory from the target. - - Some of the addresses may extend into invalid virtual memory; - skip those areas. - Return a list of blocks containing the valid area(s) in the - requested range. 
- """ - regions = [] - start_addr = start_addr & (BLOCK_SIZE - 1) - end_addr = (end_addr + BLOCK_SIZE - 1) & (BLOCK_SIZE - 1) - for addr in range(start_addr, end_addr, BLOCK_SIZE): - reply = self.pass_through(f"m{addr:x},{(BLOCK_SIZE - 1):x}") - if reply and reply[0] != 'E': - block = MemoryBlockSnapshot(addr, reply[1:]) - regions.append(block) - return regions - - def ensure_register_info(self): - if self.general_purpose_register_info is not None: - return - reply = self.pass_through("qHostInfo") - little_endian = any(kv == ("endian", "little") for kv in self.parse_pairs(reply)) - self.general_purpose_register_info = {} - lldb_index = 0 - while True: - reply = self.pass_through(f"qRegisterInfo{lldb_index:x}") - if not reply or reply[0] == 'E': - break - info = {k:v for k, v in self.parse_pairs(reply)} - reg_info = RegisterInfo(lldb_index, int(info["bitsize"]), little_endian) - if info["set"] == "General Purpose Registers" and not "container-regs" in info: - self.general_purpose_register_info[lldb_index] = reg_info - if "generic" in info: - if info["generic"] == "pc": - self.pc_register_info = reg_info - elif info["generic"] == "sp": - self.sp_register_info = reg_info - lldb_index += 1 - if self.pc_register_info is None or self.sp_register_info is None: - raise ValueError("Can't find generic pc or sp register") - - def get_thread_list(self): - threads = [] - reply = self.pass_through("qfThreadInfo") - while True: - if not reply: - raise ValueError("Missing reply packet") - if reply[0] == 'm': - for id in reply[1:].split(","): - threads.append(self.parse_thread_id(id)) - elif reply[0] == 'l': - return threads - reply = self.pass_through("qsThreadInfo") diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index 7cc1ac9749ec93..8884ef5933ada8 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -143,8 +143,6 @@ STOPPED_DUE_TO_WATCHPOINT = "Process 
should be stopped due to watchpoint" -STOPPED_DUE_TO_HISTORY_BOUNDARY = "Process should be stopped due to history boundary" - DATA_TYPES_DISPLAYED_CORRECTLY = "Data type(s) displayed correctly" VALID_BREAKPOINT = "Got a valid breakpoint" diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index 07780f9f9c8393..9773144723c34c 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -564,10 +564,6 @@ uint32_t SBProcess::GetAddressByteSize() const { } SBError SBProcess::Continue() { - return Continue(RunDirection::eRunForward); -} - -SBError SBProcess::Continue(RunDirection direction) { LLDB_INSTRUMENT_VA(this); SBError sb_error; @@ -578,9 +574,9 @@ SBError SBProcess::Continue(RunDirection direction) { process_sp->GetTarget().GetAPIMutex()); if (process_sp->GetTarget().GetDebugger().GetAsyncExecution()) - sb_error.ref() = process_sp->Resume(direction); + sb_error.ref() = process_sp->Resume(); else - sb_error.ref() = process_sp->ResumeSynchronous(nullptr, direction); + sb_error.ref() = process_sp->ResumeSynchronous(nullptr); } else sb_error = Status::FromErrorString("SBProcess is invalid"); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index aca8a039952960..a99456e06d0329 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -172,7 +172,6 @@ size_t SBThread::GetStopReasonDataCount() { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: - case eStopReasonHistoryBoundary: // There is no data for these stop reasons. return 0; @@ -234,7 +233,6 @@ uint64_t SBThread::GetStopReasonDataAtIndex(uint32_t idx) { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: - case eStopReasonHistoryBoundary: // There is no data for these stop reasons. 
return 0; diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index ea60492ac46a10..8d3a82ef6c990a 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2553,8 +2553,7 @@ bool CommandInterpreter::DidProcessStopAbnormally() const { const StopReason reason = stop_info->GetStopReason(); if (reason == eStopReasonException || reason == eStopReasonInstrumentation || - reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt || - reason == eStopReasonHistoryBoundary) + reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt) return true; if (reason == eStopReasonSignal) { diff --git a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp index b0aa664775b463..de047ee214c11e 100644 --- a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp @@ -82,9 +82,6 @@ void LogThreadStopInfo(Log &log, const ThreadStopInfo &stop_info, case eStopReasonProcessorTrace: log.Printf("%s: %s processor trace", __FUNCTION__, header); return; - case eStopReasonHistoryBoundary: - log.Printf("%s: %s history boundary", __FUNCTION__, header); - return; default: log.Printf("%s: %s invalid stop reason %" PRIu32, __FUNCTION__, header, static_cast(stop_info.reason)); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 116c43343c01d1..9b2907c6809965 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -402,16 +402,9 @@ lldb_private::DynamicLoader *ProcessKDP::GetDynamicLoader() { Status ProcessKDP::WillResume() { return Status(); } -Status ProcessKDP::DoResume(RunDirection direction) { +Status ProcessKDP::DoResume() { Status error; Log *log = GetLog(KDPLog::Process); - - if 
(direction == RunDirection::eRunReverse) { - error.SetErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - return error; - } - // Only start the async thread if we try to do any process control if (!m_async_thread.IsJoinable()) StartAsyncThread(); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h index 1b71d83f70b087..e5ec5914f9600d 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h @@ -90,7 +90,7 @@ class ProcessKDP : public lldb_private::Process { // Process Control lldb_private::Status WillResume() override; - lldb_private::Status DoResume(lldb::RunDirection direction) override; + lldb_private::Status DoResume() override; lldb_private::Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index 76b7095deaa503..703aa082f0476f 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -204,17 +204,11 @@ ProcessWindows::DoAttachToProcessWithID(lldb::pid_t pid, return error; } -Status ProcessWindows::DoResume(RunDirection direction) { +Status ProcessWindows::DoResume() { Log *log = GetLog(WindowsLog::Process); llvm::sys::ScopedLock lock(m_mutex); Status error; - if (direction == RunDirection::eRunReverse) { - error.SetErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - return error; - } - StateType private_state = GetPrivateState(); if (private_state == eStateStopped || private_state == eStateCrashed) { LLDB_LOG(log, "process {0} is in state {1}. 
Resuming...", diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h index 97284b7cd1436e..e97cfb790248be 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h @@ -52,7 +52,7 @@ class ProcessWindows : public Process, public ProcessDebugger { Status DoAttachToProcessWithID( lldb::pid_t pid, const lldb_private::ProcessAttachInfo &attach_info) override; - Status DoResume(lldb::RunDirection direction) override; + Status DoResume() override; Status DoDestroy() override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index fc792a4409410b..e42526c8fd7266 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -199,20 +199,6 @@ uint64_t GDBRemoteCommunicationClient::GetRemoteMaxPacketSize() { return m_max_packet_size; } -bool GDBRemoteCommunicationClient::GetReverseContinueSupported() { - if (m_supports_reverse_continue == eLazyBoolCalculate) { - GetRemoteQSupported(); - } - return m_supports_reverse_continue == eLazyBoolYes; -} - -bool GDBRemoteCommunicationClient::GetReverseStepSupported() { - if (m_supports_reverse_step == eLazyBoolCalculate) { - GetRemoteQSupported(); - } - return m_supports_reverse_step == eLazyBoolYes; -} - bool GDBRemoteCommunicationClient::QueryNoAckModeSupported() { if (m_supports_not_sending_acks == eLazyBoolCalculate) { m_send_acks = true; @@ -309,8 +295,6 @@ void GDBRemoteCommunicationClient::ResetDiscoverableSettings(bool did_exec) { m_supports_qXfer_siginfo_read = eLazyBoolCalculate; m_supports_augmented_libraries_svr4_read = eLazyBoolCalculate; m_uses_native_signals = eLazyBoolCalculate; - m_supports_reverse_continue = 
eLazyBoolCalculate; - m_supports_reverse_step = eLazyBoolCalculate; m_supports_qProcessInfoPID = true; m_supports_qfProcessInfo = true; m_supports_qUserName = true; @@ -364,8 +348,6 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_memory_tagging = eLazyBoolNo; m_supports_qSaveCore = eLazyBoolNo; m_uses_native_signals = eLazyBoolNo; - m_supports_reverse_continue = eLazyBoolNo; - m_supports_reverse_step = eLazyBoolNo; m_max_packet_size = UINT64_MAX; // It's supposed to always be there, but if // not, we assume no limit @@ -419,10 +401,6 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_qSaveCore = eLazyBoolYes; else if (x == "native-signals+") m_uses_native_signals = eLazyBoolYes; - else if (x == "ReverseContinue+") - m_supports_reverse_continue = eLazyBoolYes; - else if (x == "ReverseStep+") - m_supports_reverse_step = eLazyBoolYes; // Look for a list of compressions in the features list e.g. // qXfer:features:read+;PacketSize=20000;qEcho+;SupportedCompressions=zlib- // deflate,lzma diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h index 116b47c1edf033..898d176abc3465 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h @@ -331,10 +331,6 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { bool GetMultiprocessSupported(); - bool GetReverseContinueSupported(); - - bool GetReverseStepSupported(); - LazyBool SupportsAllocDeallocMemory() // const { // Uncomment this to have lldb pretend the debug server doesn't respond to @@ -565,8 +561,6 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { LazyBool m_supports_memory_tagging = eLazyBoolCalculate; LazyBool m_supports_qSaveCore = eLazyBoolCalculate; LazyBool m_uses_native_signals = eLazyBoolCalculate; - LazyBool 
m_supports_reverse_continue = eLazyBoolCalculate; - LazyBool m_supports_reverse_step = eLazyBoolCalculate; bool m_supports_qProcessInfoPID : 1, m_supports_qfProcessInfo : 1, m_supports_qUserName : 1, m_supports_qGroupName : 1, diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 4016cde74ebea8..35fa93e53bc66f 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -716,7 +716,6 @@ static const char *GetStopReasonString(StopReason stop_reason) { return "vforkdone"; case eStopReasonInterrupt: return "async interrupt"; - case eStopReasonHistoryBoundary: case eStopReasonInstrumentation: case eStopReasonInvalid: case eStopReasonPlanComplete: diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3fc03bd05d5df0..3e09c316d74f44 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -169,10 +169,6 @@ class PluginProperties : public Properties { } }; -std::chrono::seconds ResumeTimeout() { - return std::chrono::seconds(5); -} - } // namespace static PluginProperties &GetGlobalPluginProperties() { @@ -1184,11 +1180,10 @@ Status ProcessGDBRemote::WillResume() { return Status(); } -Status ProcessGDBRemote::DoResume(RunDirection direction) { +Status ProcessGDBRemote::DoResume() { Status error; Log *log = GetLog(GDBRLog::Process); - LLDB_LOGF(log, "ProcessGDBRemote::Resume(%s)", - direction == RunDirection::eRunForward ? 
"" : "reverse"); + LLDB_LOGF(log, "ProcessGDBRemote::Resume()"); ListenerSP listener_sp( Listener::MakeListener("gdb-remote.resume-packet-sent")); @@ -1202,21 +1197,12 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { StreamString continue_packet; bool continue_packet_error = false; - // Number of threads continuing with "c", i.e. continuing without a signal to deliver. - const size_t num_continue_c_tids = m_continue_c_tids.size(); - // Number of threads continuing with "C", i.e. continuing with a signal to deliver. - const size_t num_continue_C_tids = m_continue_C_tids.size(); - // Number of threads continuing with "s", i.e. single-stepping. - const size_t num_continue_s_tids = m_continue_s_tids.size(); - // Number of threads continuing with "S", i.e. single-stepping with a signal to deliver. - const size_t num_continue_S_tids = m_continue_S_tids.size(); - if (direction == RunDirection::eRunForward && - m_gdb_comm.HasAnyVContSupport()) { + if (m_gdb_comm.HasAnyVContSupport()) { std::string pid_prefix; if (m_gdb_comm.GetMultiprocessSupported()) pid_prefix = llvm::formatv("p{0:x-}.", GetID()); - if (num_continue_c_tids == num_threads || + if (m_continue_c_tids.size() == num_threads || (m_continue_c_tids.empty() && m_continue_C_tids.empty() && m_continue_s_tids.empty() && m_continue_S_tids.empty())) { // All threads are continuing @@ -1279,11 +1265,14 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { } else continue_packet_error = true; - if (direction == RunDirection::eRunForward && continue_packet_error) { + if (continue_packet_error) { // Either no vCont support, or we tried to use part of the vCont packet - // that wasn't supported by the remote GDB server, or it's the reverse - // direction. We need to try and make a simple packet that can do our - // continue. + // that wasn't supported by the remote GDB server. 
We need to try and + // make a simple packet that can do our continue + const size_t num_continue_c_tids = m_continue_c_tids.size(); + const size_t num_continue_C_tids = m_continue_C_tids.size(); + const size_t num_continue_s_tids = m_continue_s_tids.size(); + const size_t num_continue_S_tids = m_continue_S_tids.size(); if (num_continue_c_tids > 0) { if (num_continue_c_tids == num_threads) { // All threads are resuming... @@ -1374,41 +1363,9 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { } } - if (direction == RunDirection::eRunReverse && continue_packet_error) { - if (num_continue_C_tids > 0 || num_continue_S_tids > 0) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: Signals not supported"); - return Status::FromErrorString("can't deliver signals while running in reverse"); - } - - if (num_continue_s_tids > 0) { - if (num_continue_s_tids > 1) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: can't step multiple threads"); - return Status::FromErrorString("can't step multiple threads while reverse-stepping"); - } - - if (!m_gdb_comm.GetReverseStepSupported()) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-stepping"); - return Status::FromErrorString("target does not support reverse-stepping"); - } - - m_gdb_comm.SetCurrentThreadForRun(m_continue_s_tids.front()); - continue_packet.PutCString("bs"); - } else { - if (!m_gdb_comm.GetReverseContinueSupported()) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-continue"); - return Status::FromErrorString("target does not support reverse-continue"); - } - - // All threads continue whether requested or not --- - // we can't change how threads ran in the past. 
- continue_packet.PutCString("bc"); - } - - continue_packet_error = false; - } - if (continue_packet_error) { - return Status::FromErrorString("can't make continue packet for this resume"); + error = + Status::FromErrorString("can't make continue packet for this resume"); } else { EventSP event_sp; if (!m_async_thread.IsJoinable()) { @@ -1423,7 +1380,7 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { std::make_shared(continue_packet.GetString()); m_async_broadcaster.BroadcastEvent(eBroadcastBitAsyncContinue, data_sp); - if (!listener_sp->GetEvent(event_sp, ResumeTimeout())) { + if (!listener_sp->GetEvent(event_sp, std::chrono::seconds(5))) { error = Status::FromErrorString("Resume timed out."); LLDB_LOGF(log, "ProcessGDBRemote::DoResume: Resume timed out."); } else if (event_sp->BroadcasterIs(&m_async_broadcaster)) { @@ -1906,10 +1863,6 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo( thread_sp->SetStopInfo(StopInfo::CreateStopReasonWithException( *thread_sp, description.c_str())); handled = true; - } else if (reason == "replaylog") { - thread_sp->SetStopInfo(StopInfo::CreateStopReasonHistoryBoundary( - *thread_sp, description.c_str())); - handled = true; } else if (reason == "exec") { did_exec = true; thread_sp->SetStopInfo( @@ -2365,8 +2318,6 @@ StateType ProcessGDBRemote::SetThreadStopInfo(StringExtractor &stop_packet) { description = std::string(ostr.GetString()); } else if (key.compare("swbreak") == 0 || key.compare("hwbreak") == 0) { reason = "breakpoint"; - } else if (key.compare("replaylog") == 0) { - reason = "replaylog"; } else if (key.compare("library") == 0) { auto error = LoadModules(); if (error) { diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h index fa3e1cec76e2b3..2492795851388a 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h @@ -111,7 +111,7 @@ class ProcessGDBRemote 
: public Process, // Process Control Status WillResume() override; - Status DoResume(lldb::RunDirection direction) override; + Status DoResume() override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp index 304c12173dd35d..d2111ce877ce55 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp @@ -182,15 +182,10 @@ void ScriptedProcess::DidResume() { m_pid = GetInterface().GetProcessID(); } -Status ScriptedProcess::DoResume(RunDirection direction) { +Status ScriptedProcess::DoResume() { LLDB_LOGF(GetLog(LLDBLog::Process), "ScriptedProcess::%s resuming process", __FUNCTION__); - if (direction == RunDirection::eRunForward) { - return GetInterface().Resume(); - } else { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - } + return GetInterface().Resume(); } Status ScriptedProcess::DoAttach(const ProcessAttachInfo &attach_info) { diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h index 8ebe4ca5f3d449..0335364b4010b2 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h @@ -52,7 +52,7 @@ class ScriptedProcess : public Process { void DidResume() override; - Status DoResume(lldb::RunDirection direction) override; + Status DoResume() override; Status DoAttachToProcessWithID(lldb::pid_t pid, const ProcessAttachInfo &attach_info) override; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index ff6a2f59eba35f..aca08972811470 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -446,8 +446,7 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, m_memory_cache(*this), 
m_allocated_memory_cache(*this), m_should_detach(false), m_next_event_action_up(), m_public_run_lock(), m_private_run_lock(), m_currently_handling_do_on_removals(false), - m_resume_requested(false), m_last_run_direction(eRunForward), - m_interrupt_tid(LLDB_INVALID_THREAD_ID), + m_resume_requested(false), m_interrupt_tid(LLDB_INVALID_THREAD_ID), m_finalizing(false), m_destructing(false), m_clear_thread_plans_on_stop(false), m_force_next_event_delivery(false), m_last_broadcast_state(eStateInvalid), m_destroy_in_process(false), @@ -846,7 +845,6 @@ bool Process::HandleProcessStateChangedEvent( switch (thread_stop_reason) { case eStopReasonInvalid: case eStopReasonNone: - case eStopReasonHistoryBoundary: break; case eStopReasonSignal: { @@ -1354,7 +1352,7 @@ void Process::SetPublicState(StateType new_state, bool restarted) { } } -Status Process::Resume(RunDirection direction) { +Status Process::Resume() { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "(plugin = %s) -- locking run lock", GetPluginName().data()); if (!m_public_run_lock.TrySetRunning()) { @@ -1363,7 +1361,7 @@ Status Process::Resume(RunDirection direction) { return Status::FromErrorString( "Resume request failed - process still running."); } - Status error = PrivateResume(direction); + Status error = PrivateResume(); if (!error.Success()) { // Undo running state change m_public_run_lock.SetStopped(); @@ -1371,7 +1369,7 @@ Status Process::Resume(RunDirection direction) { return error; } -Status Process::ResumeSynchronous(Stream *stream, RunDirection direction) { +Status Process::ResumeSynchronous(Stream *stream) { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "Process::ResumeSynchronous -- locking run lock"); if (!m_public_run_lock.TrySetRunning()) { @@ -1384,7 +1382,7 @@ Status Process::ResumeSynchronous(Stream *stream, RunDirection direction) { Listener::MakeListener(ResumeSynchronousHijackListenerName.data())); HijackProcessEvents(listener_sp); - Status error = 
PrivateResume(direction); + Status error = PrivateResume(); if (error.Success()) { StateType state = WaitForProcessToStop(std::nullopt, nullptr, true, listener_sp, stream, @@ -3241,7 +3239,7 @@ Status Process::ConnectRemote(llvm::StringRef remote_url) { return error; } -Status Process::PrivateResume(RunDirection direction) { +Status Process::PrivateResume() { Log *log(GetLog(LLDBLog::Process | LLDBLog::Step)); LLDB_LOGF(log, "Process::PrivateResume() m_stop_id = %u, public state: %s " @@ -3257,15 +3255,6 @@ Status Process::PrivateResume(RunDirection direction) { if (!GetModID().IsLastResumeForUserExpression()) ResetExtendedCrashInfoDict(); - if (m_last_run_direction != direction) { - // In the future we might want to support mixed-direction plans, - // e.g. a forward step-over stops at a breakpoint, the user does - // a reverse-step, then disables the breakpoint and continues forward. - // This code will need to be changed to support that. - m_thread_list.DiscardThreadPlans(); - m_last_run_direction = direction; - } - Status error(WillResume()); // Tell the process it is about to resume before the thread list if (error.Success()) { @@ -3283,7 +3272,7 @@ Status Process::PrivateResume(RunDirection direction) { "Process::PrivateResume PreResumeActions failed, not resuming."); } else { m_mod_id.BumpResumeID(); - error = DoResume(direction); + error = DoResume(); if (error.Success()) { DidResume(); m_thread_list.DidResume(); @@ -3746,7 +3735,7 @@ bool Process::ShouldBroadcastEvent(Event *event_ptr) { "from state: %s", static_cast(event_ptr), StateAsCString(state)); ProcessEventData::SetRestartedInEvent(event_ptr, true); - PrivateResume(m_last_run_direction); + PrivateResume(); } } else { return_value = true; @@ -4357,7 +4346,7 @@ void Process::ProcessEventData::DoOnRemoval(Event *event_ptr) { SetRestarted(true); // Use the private resume method here, since we aren't changing the run // lock state. 
- process_sp->PrivateResume(process_sp->m_last_run_direction); + process_sp->PrivateResume(); } else { bool hijacked = process_sp->IsHijackedForEvent(eBroadcastBitStateChanged) && !process_sp->StateChangedIsHijackedForSynchronousResume(); diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index 08e9a7c099bad2..bd7032b803df90 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -1212,30 +1212,6 @@ class StopInfoProcessorTrace : public StopInfo { } }; -// StopInfoHistoryBoundary - -class StopInfoHistoryBoundary : public StopInfo { -public: - StopInfoHistoryBoundary(Thread &thread, const char *description) - : StopInfo(thread, LLDB_INVALID_UID) { - if (description) - SetDescription(description); - } - - ~StopInfoHistoryBoundary() override = default; - - StopReason GetStopReason() const override { - return eStopReasonHistoryBoundary; - } - - const char *GetDescription() override { - if (m_description.empty()) - return "history boundary"; - else - return m_description.c_str(); - } -}; - // StopInfoThreadPlan class StopInfoThreadPlan : public StopInfo { @@ -1463,11 +1439,6 @@ StopInfoSP StopInfo::CreateStopReasonProcessorTrace(Thread &thread, return StopInfoSP(new StopInfoProcessorTrace(thread, description)); } -StopInfoSP StopInfo::CreateStopReasonHistoryBoundary(Thread &thread, - const char *description) { - return StopInfoSP(new StopInfoHistoryBoundary(thread, description)); -} - StopInfoSP StopInfo::CreateStopReasonWithExec(Thread &thread) { return StopInfoSP(new StopInfoExec(thread)); } diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index bbb586f033b746..902fbb2b519ef7 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -624,12 +624,10 @@ void Thread::SetupForResume() { // what the current plan is. 
lldb::RegisterContextSP reg_ctx_sp(GetRegisterContext()); - ProcessSP process_sp(GetProcess()); - if (reg_ctx_sp && process_sp && - process_sp->GetLastRunDirection() == eRunForward) { + if (reg_ctx_sp) { const addr_t thread_pc = reg_ctx_sp->GetPC(); BreakpointSiteSP bp_site_sp = - process_sp->GetBreakpointSiteList().FindByAddress(thread_pc); + GetProcess()->GetBreakpointSiteList().FindByAddress(thread_pc); if (bp_site_sp) { // Note, don't assume there's a ThreadPlanStepOverBreakpoint, the // target may not require anything special to step over a breakpoint. @@ -1734,8 +1732,6 @@ std::string Thread::StopReasonAsString(lldb::StopReason reason) { return "processor trace"; case eStopReasonInterrupt: return "async interrupt"; - case eStopReasonHistoryBoundary: - return "history boundary"; } return "StopReason = " + std::to_string(reason); diff --git a/lldb/test/API/functionalities/reverse-execution/Makefile b/lldb/test/API/functionalities/reverse-execution/Makefile deleted file mode 100644 index 10495940055b63..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -C_SOURCES := main.c - -include Makefile.rules diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py deleted file mode 100644 index b37578fbd82468..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py +++ /dev/null @@ -1,115 +0,0 @@ -import lldb -import time -import unittest -from lldbsuite.test.lldbtest import * -from lldbsuite.test.decorators import * -from lldbsuite.test.gdbclientutils import * -from lldbsuite.test.lldbreverse import ReverseTestBase -from lldbsuite.test import lldbutil - - -class TestReverseContinueBreakpoints(ReverseTestBase): - NO_DEBUG_INFO_TESTCASE = True - - def test_reverse_continue(self): - self.reverse_continue_internal(async_mode=False) - - def 
test_reverse_continue_async(self): - self.reverse_continue_internal(async_mode=True) - - def reverse_continue_internal(self, async_mode): - target, process, initial_threads = self.setup_recording(async_mode) - - # Reverse-continue. We'll stop at the point where we started recording. - status = process.Continue(lldb.eRunReverse) - self.assertSuccess(status) - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) - self.expect( - "thread list", - STOPPED_DUE_TO_HISTORY_BOUNDARY, - substrs=["stopped", "stop reason = history boundary"], - ) - - # Continue forward normally until the target exits. - status = process.Continue() - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateExited]) - self.assertSuccess(status) - self.assertState(process.GetState(), lldb.eStateExited) - self.assertEqual(process.GetExitStatus(), 0) - - def test_reverse_continue_breakpoint(self): - self.reverse_continue_breakpoint_internal(async_mode=False) - - def test_reverse_continue_breakpoint_async(self): - self.reverse_continue_breakpoint_internal(async_mode=True) - - def reverse_continue_breakpoint_internal(self, async_mode): - target, process, initial_threads = self.setup_recording(async_mode) - - # Reverse-continue to the function "trigger_breakpoint". 
- trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) - status = process.Continue(lldb.eRunReverse) - self.assertSuccess(status) - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) - threads_now = lldbutil.get_threads_stopped_at_breakpoint(process, trigger_bkpt) - self.assertEqual(threads_now, initial_threads) - - def test_reverse_continue_skip_breakpoint(self): - self.reverse_continue_skip_breakpoint_internal(async_mode=False) - - def test_reverse_continue_skip_breakpoint_async(self): - self.reverse_continue_skip_breakpoint_internal(async_mode=True) - - def reverse_continue_skip_breakpoint_internal(self, async_mode): - target, process, initial_threads = self.setup_recording(async_mode) - - # Reverse-continue over a breakpoint at "trigger_breakpoint" whose - # condition is false. - # This tests that we continue in the correct direction after hitting - # the breakpoint. - trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) - trigger_bkpt.SetCondition("false_condition") - status = process.Continue(lldb.eRunReverse) - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) - self.assertSuccess(status) - self.expect( - "thread list", - STOPPED_DUE_TO_HISTORY_BOUNDARY, - substrs=["stopped", "stop reason = history boundary"], - ) - - def setup_recording(self, async_mode): - """ - Record execution of code between "start_recording" and "stop_recording" breakpoints. - - Returns with the target stopped at "stop_recording", with recording disabled, - ready to reverse-execute. - """ - self.build() - target = self.dbg.CreateTarget("") - process = self.connect(target) - - # Record execution from the start of the function "start_recording" - # to the start of the function "stop_recording". We want to keep the - # interval that we record as small as possible to minimize the run-time - # of our single-stepping recorder. 
- start_recording_bkpt = target.BreakpointCreateByName("start_recording", None) - initial_threads = lldbutil.continue_to_breakpoint(process, start_recording_bkpt) - self.assertEqual(len(initial_threads), 1) - target.BreakpointDelete(start_recording_bkpt.GetID()) - self.start_recording() - stop_recording_bkpt = target.BreakpointCreateByName("stop_recording", None) - lldbutil.continue_to_breakpoint(process, stop_recording_bkpt) - target.BreakpointDelete(stop_recording_bkpt.GetID()) - self.stop_recording() - - self.dbg.SetAsync(async_mode) - self.expect_async_state_changes(async_mode, process, [lldb.eStateStopped]) - - return target, process, initial_threads - - def expect_async_state_changes(self, async_mode, process, states): - if not async_mode: - return - listener = self.dbg.GetListener() - lldbutil.expect_state_changes(self, listener, process, states) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py deleted file mode 100644 index d610761b8cb0bc..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py +++ /dev/null @@ -1,30 +0,0 @@ -import lldb -import unittest -from lldbsuite.test.lldbtest import * -from lldbsuite.test.decorators import * -from lldbsuite.test import lldbutil - - -class TestReverseContinueNotSupported(TestBase): - NO_DEBUG_INFO_TESTCASE = True - - def test_reverse_continue_not_supported(self): - self.build() - exe = self.getBuildArtifact("a.out") - target = self.dbg.CreateTarget(exe) - self.assertTrue(target, VALID_TARGET) - - main_bkpt = target.BreakpointCreateByName("main", None) - self.assertTrue(main_bkpt, VALID_BREAKPOINT) - - process = target.LaunchSimple(None, None, self.get_process_working_directory()) - self.assertTrue(process, PROCESS_IS_VALID) - - # This will fail gracefully. 
- status = process.Continue(lldb.eRunReverse) - self.assertFailure(status, "target does not support reverse-continue") - - status = process.Continue() - self.assertSuccess(status) - self.assertState(process.GetState(), lldb.eStateExited) - self.assertEqual(process.GetExitStatus(), 0) diff --git a/lldb/test/API/functionalities/reverse-execution/main.c b/lldb/test/API/functionalities/reverse-execution/main.c deleted file mode 100644 index 40e45dc9f5c317..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/main.c +++ /dev/null @@ -1,14 +0,0 @@ -volatile int false_condition = 0; - -static void start_recording() {} - -static void trigger_breakpoint() {} - -static void stop_recording() {} - -int main() { - start_recording(); - trigger_breakpoint(); - stop_recording(); - return 0; -} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 211fd34957f496..558f889c4b7f23 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -1045,9 +1045,6 @@ llvm::json::Value CreateThreadStopped(lldb::SBThread &thread, case lldb::eStopReasonProcessorTrace: body.try_emplace("reason", "processor trace"); break; - case lldb::eStopReasonHistoryBoundary: - body.try_emplace("reason", "history boundary"); - break; case lldb::eStopReasonSignal: case lldb::eStopReasonException: body.try_emplace("reason", "exception"); diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp index 1c5e3ac7008727..b38833c0fdb6b6 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.cpp +++ b/lldb/tools/lldb-dap/LLDBUtils.cpp @@ -111,7 +111,6 @@ bool ThreadHasStopReason(lldb::SBThread &thread) { case lldb::eStopReasonVFork: case lldb::eStopReasonVForkDone: case lldb::eStopReasonInterrupt: - case lldb::eStopReasonHistoryBoundary: return true; case lldb::eStopReasonThreadExiting: case lldb::eStopReasonInvalid: From e9c8f75d45ababe7f805078bbf7bda2e7425f1b7 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Thu, 10 Oct 
2024 15:59:51 -0700 Subject: [PATCH 090/177] [LLDB][Minidump] Have Minidumps save off and properly read TLS data (#109477) This patch adds the support to `Process.cpp` to automatically save off TLS sections, either via loading the memory region for the module, or via reading `fs_base` via generic register. Then when Minidumps are loaded, we now specify we want the dynamic loader to be the `POSIXDYLD` so we can leverage the same TLS accessor code as `ProcessELFCore`. Being able to access TLS Data is an important step for LLDB generated minidumps to have feature parity with ELF Core dumps. --- lldb/include/lldb/Target/DynamicLoader.h | 12 +++ lldb/source/Core/DynamicLoader.cpp | 7 +- .../POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp | 80 +++++++++++++++++++ .../POSIX-DYLD/DynamicLoaderPOSIXDYLD.h | 6 ++ .../Process/minidump/ProcessMinidump.cpp | 20 ++++- .../Process/minidump/ProcessMinidump.h | 5 +- .../RegisterContextMinidump_x86_64.cpp | 20 ++++- lldb/source/Target/Process.cpp | 36 ++++++++- .../TestProcessSaveCoreMinidump.py | 77 +++++++++++++++++- .../process_save_core_minidump/main.cpp | 1 + 10 files changed, 248 insertions(+), 16 deletions(-) diff --git a/lldb/include/lldb/Target/DynamicLoader.h b/lldb/include/lldb/Target/DynamicLoader.h index 0629e2faae7e9e..75bb6cb6bb9074 100644 --- a/lldb/include/lldb/Target/DynamicLoader.h +++ b/lldb/include/lldb/Target/DynamicLoader.h @@ -11,6 +11,7 @@ #include "lldb/Core/Address.h" #include "lldb/Core/PluginInterface.h" +#include "lldb/Target/CoreFileMemoryRanges.h" #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/Status.h" #include "lldb/Utility/UUID.h" @@ -337,6 +338,17 @@ class DynamicLoader : public PluginInterface { return std::nullopt; } + /// Returns a list of memory ranges that should be saved in the core file, + /// specific for this dynamic loader. + /// + /// For example, an implementation of this function can save the thread + /// local data of a given thread. 
+ virtual void CalculateDynamicSaveCoreRanges( + lldb_private::Process &process, + std::vector &ranges, + llvm::function_ref + save_thread_predicate) {}; + protected: // Utility methods for derived classes diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp index 7758a87403b5a3..68d6ab0850853f 100644 --- a/lldb/source/Core/DynamicLoader.cpp +++ b/lldb/source/Core/DynamicLoader.cpp @@ -83,7 +83,11 @@ ModuleSP DynamicLoader::GetTargetExecutable() { ModuleSpec module_spec(executable->GetFileSpec(), executable->GetArchitecture()); auto module_sp = std::make_shared(module_spec); - + // If we're a coredump and we already have a main executable, we don't + // need to reload the module list that target already has + if (!m_process->IsLiveDebugSession()) { + return executable; + } // Check if the executable has changed and set it to the target // executable if they differ. if (module_sp && module_sp->GetUUID().IsValid() && @@ -369,4 +373,3 @@ void DynamicLoader::LoadOperatingSystemPlugin(bool flush) if (m_process) m_process->LoadOperatingSystemPlugin(flush); } - diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp index b9c0e174c3be68..34aca50df0ac4b 100644 --- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp @@ -18,6 +18,7 @@ #include "lldb/Symbol/ObjectFile.h" #include "lldb/Target/MemoryRegionInfo.h" #include "lldb/Target/Platform.h" +#include "lldb/Target/RegisterContext.h" #include "lldb/Target/Target.h" #include "lldb/Target/Thread.h" #include "lldb/Target/ThreadPlanRunToAddress.h" @@ -866,3 +867,82 @@ bool DynamicLoaderPOSIXDYLD::AlwaysRelyOnEHUnwindInfo( bool DynamicLoaderPOSIXDYLD::IsCoreFile() const { return !m_process->IsLiveDebugSession(); } + +// For our ELF/POSIX builds save off the fs_base/gs_base regions +static void 
AddThreadLocalMemoryRegions(Process &process, ThreadSP &thread_sp, + std::vector &ranges) { + lldb::RegisterContextSP reg_ctx = thread_sp->GetRegisterContext(); + if (!reg_ctx) + return; + + const RegisterInfo *reg_info = reg_ctx->GetRegisterInfo( + lldb::RegisterKind::eRegisterKindGeneric, LLDB_REGNUM_GENERIC_TP); + if (!reg_info) + return; + + lldb_private::RegisterValue thread_local_register_value; + bool success = reg_ctx->ReadRegister(reg_info, thread_local_register_value); + if (!success) + return; + + const uint64_t fail_value = UINT64_MAX; + bool readSuccess = false; + const lldb::addr_t reg_value_addr = + thread_local_register_value.GetAsUInt64(fail_value, &readSuccess); + if (!readSuccess || reg_value_addr == fail_value) + return; + + MemoryRegionInfo thread_local_region; + Status err = process.GetMemoryRegionInfo(reg_value_addr, thread_local_region); + if (err.Fail()) + return; + + ranges.push_back(thread_local_region); +} + +// Save off the link map for core files. +static void AddLinkMapSections(Process &process, + std::vector &ranges) { + ModuleList &module_list = process.GetTarget().GetImages(); + Target *target = &process.GetTarget(); + for (size_t idx = 0; idx < module_list.GetSize(); idx++) { + ModuleSP module_sp = module_list.GetModuleAtIndex(idx); + if (!module_sp) + continue; + + ObjectFile *obj = module_sp->GetObjectFile(); + if (!obj) + continue; + Address addr = obj->GetImageInfoAddress(target); + addr_t load_addr = addr.GetLoadAddress(target); + if (load_addr == LLDB_INVALID_ADDRESS) + continue; + + MemoryRegionInfo link_map_section; + Status err = process.GetMemoryRegionInfo(load_addr, link_map_section); + if (err.Fail()) + continue; + + ranges.push_back(link_map_section); + } +} + +void DynamicLoaderPOSIXDYLD::CalculateDynamicSaveCoreRanges( + lldb_private::Process &process, + std::vector &ranges, + llvm::function_ref + save_thread_predicate) { + ThreadList &thread_list = process.GetThreadList(); + for (size_t idx = 0; idx < 
thread_list.GetSize(); idx++) { + ThreadSP thread_sp = thread_list.GetThreadAtIndex(idx); + if (!thread_sp) + continue; + + if (!save_thread_predicate(*thread_sp)) + continue; + + AddThreadLocalMemoryRegions(process, thread_sp, ranges); + } + + AddLinkMapSections(process, ranges); +} diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h index 4c92335602cdf4..bde334aaca40b4 100644 --- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h +++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h @@ -60,6 +60,12 @@ class DynamicLoaderPOSIXDYLD : public lldb_private::DynamicLoader { lldb::addr_t base_addr, bool base_addr_is_offset) override; + void CalculateDynamicSaveCoreRanges( + lldb_private::Process &process, + std::vector &ranges, + llvm::function_ref + save_thread_predicate) override; + protected: /// Runtime linker rendezvous structure. DYLDRendezvous m_rendezvous; diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index 32ffba763c08e3..5ea3db23f114c4 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -21,11 +21,13 @@ #include "lldb/Interpreter/CommandReturnObject.h" #include "lldb/Interpreter/OptionArgParser.h" #include "lldb/Interpreter/OptionGroupBoolean.h" +#include "lldb/Target/DynamicLoader.h" #include "lldb/Target/JITLoaderList.h" #include "lldb/Target/MemoryRegionInfo.h" #include "lldb/Target/SectionLoadList.h" #include "lldb/Target/Target.h" #include "lldb/Target/UnixSignals.h" +#include "lldb/Utility/DataBufferHeap.h" #include "lldb/Utility/LLDBAssert.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" @@ -34,6 +36,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Threading.h" +#include 
"Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h" #include "Plugins/ObjectFile/Placeholder/ObjectFilePlaceholder.h" #include "Plugins/Process/Utility/StopInfoMachException.h" @@ -333,6 +336,16 @@ ArchSpec ProcessMinidump::GetArchitecture() { return ArchSpec(triple); } +DataExtractor ProcessMinidump::GetAuxvData() { + std::optional> auxv = + m_minidump_parser->GetStream(StreamType::LinuxAuxv); + if (!auxv) + return DataExtractor(); + + return DataExtractor(auxv->data(), auxv->size(), GetByteOrder(), + GetAddressByteSize(), GetAddressByteSize()); +} + void ProcessMinidump::BuildMemoryRegions() { if (m_memory_regions) return; @@ -534,7 +547,12 @@ void ProcessMinidump::ReadModuleList() { module_sp = Module::CreateModuleFromObjectFile( module_spec, load_addr, load_size); - GetTarget().GetImages().Append(module_sp, true /* notify */); + // If we haven't loaded a main executable yet, set the first module to be + // main executable + if (!GetTarget().GetExecutableModule()) + GetTarget().SetExecutableModule(module_sp); + else + GetTarget().GetImages().Append(module_sp, true /* notify */); } bool load_addr_changed = false; diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.h b/lldb/source/Plugins/Process/minidump/ProcessMinidump.h index f2ea0a2b61d14e..3d235670a33abc 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.h +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.h @@ -53,12 +53,11 @@ class ProcessMinidump : public PostMortemProcess { Status DoLoadCore() override; - DynamicLoader *GetDynamicLoader() override { return nullptr; } + // Returns AUXV structure found in the core file + lldb_private::DataExtractor GetAuxvData() override; llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); } - SystemRuntime *GetSystemRuntime() override { return nullptr; } - Status DoDestroy() override; void RefreshStateAfterStop() override; diff --git a/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp 
b/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp index e879c493156593..f305d1b7031d82 100644 --- a/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp +++ b/lldb/source/Plugins/Process/minidump/RegisterContextMinidump_x86_64.cpp @@ -44,6 +44,17 @@ static void writeRegister(const void *reg_src, uint8_t *context, memcpy(reg_dest.data(), reg_src, reg_dest.size()); } +// TODO: Fix the registers in this file! +// writeRegister checks x86_64 registers without base registers. This causes +// an overlap in the register enum values. So we were truncating fs_base. +// We should standardize to the x86_64_with_base registers. +static void writeBaseRegister(const void *reg_src, uint8_t *context, + const RegisterInfo ®) { + auto bytes = reg.mutable_data(context); + llvm::MutableArrayRef reg_dest = bytes.take_front(8); + memcpy(reg_dest.data(), reg_src, reg_dest.size()); +} + lldb::DataBufferSP lldb_private::minidump::ConvertMinidumpContext_x86_64( llvm::ArrayRef source_data, RegisterInfoInterface *target_reg_interface) { @@ -105,11 +116,12 @@ lldb::DataBufferSP lldb_private::minidump::ConvertMinidumpContext_x86_64( writeRegister(&context->r15, result_base, reg_info[lldb_r15_x86_64]); } + // See comment on base regsiter if ((context_flags & LLDBSpecificFlag) == LLDBSpecificFlag) { - writeRegister(&context->fs_base, result_base, - reg_info[x86_64_with_base::lldb_fs_base]); - writeRegister(&context->gs_base, result_base, - reg_info[x86_64_with_base::lldb_gs_base]); + writeBaseRegister(&context->fs_base, result_base, + reg_info[x86_64_with_base::lldb_fs_base]); + writeBaseRegister(&context->gs_base, result_base, + reg_info[x86_64_with_base::lldb_gs_base]); } // TODO parse the floating point registers diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index aca08972811470..c009d17d3ba507 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -6528,6 +6528,29 @@ static void 
AddRegion(const MemoryRegionInfo ®ion, bool try_dirty_pages, CreateCoreFileMemoryRange(region)); } +static void SaveDynamicLoaderSections(Process &process, + const SaveCoreOptions &options, + CoreFileMemoryRanges &ranges, + std::set &stack_ends) { + DynamicLoader *dyld = process.GetDynamicLoader(); + if (!dyld) + return; + + std::vector dynamic_loader_mem_regions; + std::function save_thread_predicate = + [&](const lldb_private::Thread &t) -> bool { + return options.ShouldThreadBeSaved(t.GetID()); + }; + dyld->CalculateDynamicSaveCoreRanges(process, dynamic_loader_mem_regions, + save_thread_predicate); + for (const auto ®ion : dynamic_loader_mem_regions) { + // The Dynamic Loader can give us regions that could include a truncated + // stack + if (stack_ends.count(region.GetRange().GetRangeEnd()) == 0) + AddRegion(region, true, ranges); + } +} + static void SaveOffRegionsWithStackPointers(Process &process, const SaveCoreOptions &core_options, const MemoryRegionInfos ®ions, @@ -6559,11 +6582,13 @@ static void SaveOffRegionsWithStackPointers(Process &process, // off in other calls sp_region.GetRange().SetRangeBase(stack_head); sp_region.GetRange().SetByteSize(stack_size); - stack_ends.insert(sp_region.GetRange().GetRangeEnd()); + const addr_t range_end = sp_region.GetRange().GetRangeEnd(); + stack_ends.insert(range_end); // This will return true if the threadlist the user specified is empty, // or contains the thread id from thread_sp. - if (core_options.ShouldThreadBeSaved(thread_sp->GetID())) + if (core_options.ShouldThreadBeSaved(thread_sp->GetID())) { AddRegion(sp_region, try_dirty_pages, ranges); + } } } } @@ -6672,9 +6697,14 @@ Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options, std::set stack_ends; // For fully custom set ups, we don't want to even look at threads if there // are no threads specified. 
- if (core_style != lldb::eSaveCoreCustomOnly || options.HasSpecifiedThreads()) + if (core_style != lldb::eSaveCoreCustomOnly || + options.HasSpecifiedThreads()) { SaveOffRegionsWithStackPointers(*this, options, regions, ranges, stack_ends); + // Save off the dynamic loader sections, so if we are on an architecture + // that supports Thread Locals, that we include those as well. + SaveDynamicLoaderSections(*this, options, ranges, stack_ends); + } switch (core_style) { case eSaveCoreUnspecified: diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py index 03cc415924e0bb..4818dde4f3b838 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py @@ -523,8 +523,10 @@ def minidump_deleted_on_save_failure(self): finally: self.assertTrue(self.dbg.DeleteTarget(target)) - def minidump_deterministic_difference(self): - """Test that verifies that two minidumps produced are identical.""" + @skipUnlessPlatform(["linux"]) + @skipUnlessArch("x86_64") + def minidump_saves_fs_base_region(self): + """Test that verifies the minidump file saves region for fs_base""" self.build() exe = self.getBuildArtifact("a.out") @@ -534,6 +536,45 @@ def minidump_deterministic_difference(self): None, None, self.get_process_working_directory() ) self.assertState(process.GetState(), lldb.eStateStopped) + thread = process.GetThreadAtIndex(0) + custom_file = self.getBuildArtifact("core.reg_region.dmp") + options = lldb.SBSaveCoreOptions() + options.SetOutputFile(lldb.SBFileSpec(custom_file)) + options.SetPluginName("minidump") + options.SetStyle(lldb.eSaveCoreCustomOnly) + options.AddThread(thread) + error = process.SaveCore(options) + self.assertTrue(error.Success()) + + registers = thread.GetFrameAtIndex(0).GetRegisters() + fs_base = 
registers.GetFirstValueByName("fs_base").GetValueAsUnsigned() + self.assertTrue(fs_base != 0) + core_target = self.dbg.CreateTarget(None) + core_proc = core_target.LoadCore(one_region_file) + core_region_list = core_proc.GetMemoryRegions() + live_region_list = process.GetMemoryRegions() + live_region = lldb.SBMemoryRegionInfo() + live_region_list.GetMemoryRegionForAddress(fs_base, live_region) + core_region = lldb.SBMemoryRegionInfo() + error = core_region_list.GetMemoryRegionForAddress(fs_base, core_region) + self.assertTrue(error.Success()) + self.assertEqual(live_region, core_region) + + finally: + self.assertTrue(self.dbg.DeleteTarget(target)) + self.assertTrue(self.dbg.DeleteTarget(core_target)) + if os.path.isfile(custom_file): + os.unlink(custom_file) + + def minidump_deterministic_difference(self): + """Test that verifies that two minidumps produced are identical.""" + self.build() + exe = self.getBuildArtifact("a.out") + try: + target = self.dbg.CreateTarget(exe) + process = target.LaunchSimple( + None, None, self.get_process_working_directory() + ) core_styles = [ lldb.eSaveCoreStackOnly, @@ -562,6 +603,36 @@ def minidump_deterministic_difference(self): self.assertEqual(file_one, file_two) self.assertTrue(os.unlink(spec_one.GetFileName())) self.assertTrue(os.unlink(spec_two.GetFileName())) - finally: self.assertTrue(self.dbg.DeleteTarget(target)) + + @skipUnlessPlatform(["linux"]) + @skipUnlessArch("x86_64") + def minidump_saves_fs_base_region(self): + self.build() + exe = self.getBuildArtifact("a.out") + try: + target = self.dbg.CreateTarget(exe) + process = target.LaunchSimple( + None, None, self.get_process_working_directory() + ) + self.assertState(process.GetState(), lldb.eStateStopped) + thread = process.GetThreadAtIndex(0) + tls_file = self.getBuildArtifact("core.tls.dmp") + options = lldb.SBSaveCoreOptions() + options.SetOutputFile(lldb.SBFileSpec(tls_file)) + options.SetPluginName("minidump") + options.SetStyle(lldb.eSaveCoreCustomOnly) + 
options.AddThread(thread) + error = process.SaveCore(options) + self.assertTrue(error.Success()) + core_target = self.dbg.CreateTarget(None) + core_proc = core_target.LoadCore(tls_file) + frame = core_proc.GetThreadAtIndex(0).GetFrameAtIndex(0) + tls_val = frame.FindValue("lf") + self.assertEqual(tls_val.GetValueAsUnsigned(), 42) + + except: + self.assertTrue(self.dbg.DeleteTarget(target)) + if os.path.isfile(tls_file): + os.unlink(tls_file) diff --git a/lldb/test/API/functionalities/process_save_core_minidump/main.cpp b/lldb/test/API/functionalities/process_save_core_minidump/main.cpp index fa34a371f20647..15daa68e9a648c 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/main.cpp +++ b/lldb/test/API/functionalities/process_save_core_minidump/main.cpp @@ -1,6 +1,7 @@ #include #include #include +thread_local size_t lf = 42; void g() { assert(false); } From 4f297566b3150097de26c6a23a987d2bd5fc19c5 Mon Sep 17 00:00:00 2001 From: Robert O'Callahan Date: Fri, 11 Oct 2024 09:01:47 +1300 Subject: [PATCH 091/177] [lldb] Implement basic support for reverse-continue (#99736) This commit only adds support for the `SBProcess::ReverseContinue()` API. A user-accessible command for this will follow in a later commit. This feature depends on a gdbserver implementation (e.g. `rr`) providing support for the `bc` and `bs` packets. `lldb-server` does not support those packets, and there is no plan to change that. So, for testing purposes, `lldbreverse.py` wraps `lldb-server` with a Python implementation of *very limited* record-and-replay functionality for use by *tests only*. The majority of this PR is test infrastructure (about 700 of the 950 lines added). 
--- lldb/include/lldb/API/SBProcess.h | 1 + lldb/include/lldb/Target/Process.h | 21 +- lldb/include/lldb/Target/StopInfo.h | 6 + lldb/include/lldb/lldb-enumerations.h | 6 + .../Python/lldbsuite/test/gdbclientutils.py | 5 +- .../Python/lldbsuite/test/lldbgdbproxy.py | 175 ++++++++ .../Python/lldbsuite/test/lldbreverse.py | 418 ++++++++++++++++++ .../Python/lldbsuite/test/lldbtest.py | 2 + lldb/source/API/SBProcess.cpp | 8 +- lldb/source/API/SBThread.cpp | 2 + .../source/Interpreter/CommandInterpreter.cpp | 3 +- .../Process/Linux/NativeThreadLinux.cpp | 3 + .../Process/MacOSX-Kernel/ProcessKDP.cpp | 9 +- .../Process/MacOSX-Kernel/ProcessKDP.h | 2 +- .../Process/Windows/Common/ProcessWindows.cpp | 8 +- .../Process/Windows/Common/ProcessWindows.h | 2 +- .../GDBRemoteCommunicationClient.cpp | 22 + .../gdb-remote/GDBRemoteCommunicationClient.h | 6 + .../GDBRemoteCommunicationServerLLGS.cpp | 1 + .../Process/gdb-remote/ProcessGDBRemote.cpp | 77 +++- .../Process/gdb-remote/ProcessGDBRemote.h | 2 +- .../Process/scripted/ScriptedProcess.cpp | 9 +- .../Process/scripted/ScriptedProcess.h | 2 +- lldb/source/Target/Process.cpp | 29 +- lldb/source/Target/StopInfo.cpp | 29 ++ lldb/source/Target/Thread.cpp | 8 +- .../reverse-execution/Makefile | 3 + .../TestReverseContinueBreakpoints.py | 115 +++++ .../TestReverseContinueNotSupported.py | 30 ++ .../functionalities/reverse-execution/main.c | 14 + lldb/tools/lldb-dap/JSONUtils.cpp | 3 + lldb/tools/lldb-dap/LLDBUtils.cpp | 1 + 32 files changed, 978 insertions(+), 44 deletions(-) create mode 100644 lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py create mode 100644 lldb/packages/Python/lldbsuite/test/lldbreverse.py create mode 100644 lldb/test/API/functionalities/reverse-execution/Makefile create mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py create mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py create mode 100644 
lldb/test/API/functionalities/reverse-execution/main.c diff --git a/lldb/include/lldb/API/SBProcess.h b/lldb/include/lldb/API/SBProcess.h index 1624e02070b1b2..8b8ed830b54cc0 100644 --- a/lldb/include/lldb/API/SBProcess.h +++ b/lldb/include/lldb/API/SBProcess.h @@ -159,6 +159,7 @@ class LLDB_API SBProcess { lldb::SBError Destroy(); lldb::SBError Continue(); + lldb::SBError Continue(RunDirection direction); lldb::SBError Stop(); diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index b8c53a474ba6b9..fe7fbc50fd5770 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -857,10 +857,10 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - Status Resume(); + Status Resume(lldb::RunDirection direction = lldb::eRunForward); /// Resume a process, and wait for it to stop. - Status ResumeSynchronous(Stream *stream); + Status ResumeSynchronous(Stream *stream, lldb::RunDirection direction = lldb::eRunForward); /// Halts a running process. /// @@ -1104,9 +1104,14 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - virtual Status DoResume() { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support resuming processes", GetPluginName()); + virtual Status DoResume(lldb::RunDirection direction) { + if (direction == lldb::RunDirection::eRunForward) { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support resuming processes", GetPluginName()); + } else { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + } } /// Called after resuming a process. 
@@ -2332,6 +2337,8 @@ class Process : public std::enable_shared_from_this, bool IsRunning() const; + lldb::RunDirection GetLastRunDirection() { return m_last_run_direction; } + DynamicCheckerFunctions *GetDynamicCheckers() { return m_dynamic_checkers_up.get(); } @@ -2851,7 +2858,7 @@ void PruneThreadPlans(); /// /// \return /// An Status object describing the success or failure of the resume. - Status PrivateResume(); + Status PrivateResume(lldb::RunDirection direction = lldb::eRunForward); // Called internally void CompleteAttach(); @@ -3127,6 +3134,8 @@ void PruneThreadPlans(); // m_currently_handling_do_on_removals are true, // Resume will only request a resume, using this // flag to check. + // The direction of execution from the last time this process was resumed. + lldb::RunDirection m_last_run_direction; lldb::tid_t m_interrupt_tid; /// The tid of the thread that issued the async /// interrupt, used by thread plan timeout. It diff --git a/lldb/include/lldb/Target/StopInfo.h b/lldb/include/lldb/Target/StopInfo.h index fae90364deaf0a..072f71f6b1122f 100644 --- a/lldb/include/lldb/Target/StopInfo.h +++ b/lldb/include/lldb/Target/StopInfo.h @@ -142,6 +142,12 @@ class StopInfo : public std::enable_shared_from_this { static lldb::StopInfoSP CreateStopReasonProcessorTrace(Thread &thread, const char *description); + // This creates a StopInfo indicating that execution stopped because + // it was replaying some recorded execution history, and execution reached + // the end of that recorded history. 
+ static lldb::StopInfoSP + CreateStopReasonHistoryBoundary(Thread &thread, const char *description); + static lldb::StopInfoSP CreateStopReasonFork(Thread &thread, lldb::pid_t child_pid, lldb::tid_t child_tid); diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 938f6e3abe8f2a..232d1dfdb5c9d0 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -135,6 +135,9 @@ FLAGS_ENUM(LaunchFlags){ /// Thread Run Modes. enum RunMode { eOnlyThisThread, eAllThreads, eOnlyDuringStepping }; +/// Execution directions +enum RunDirection { eRunForward, eRunReverse }; + /// Byte ordering definitions. enum ByteOrder { eByteOrderInvalid = 0, @@ -254,6 +257,9 @@ enum StopReason { eStopReasonVFork, eStopReasonVForkDone, eStopReasonInterrupt, ///< Thread requested interrupt + // Indicates that execution stopped because the debugger backend relies + // on recorded data and we reached the end of that data. + eStopReasonHistoryBoundary, }; /// Command Return Status Types. 
diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py index 1784487323ad6b..732d6171320680 100644 --- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py +++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py @@ -510,8 +510,9 @@ def start(self): self._thread.start() def stop(self): - self._thread.join() - self._thread = None + if self._thread is not None: + self._thread.join() + self._thread = None def get_connect_address(self): return self._socket.get_connect_address() diff --git a/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py b/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py new file mode 100644 index 00000000000000..2a9592bf4545a4 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py @@ -0,0 +1,175 @@ +import logging +import os +import os.path +import random + +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.gdbclientutils import * +import lldbgdbserverutils +from lldbsuite.support import seven + + +class GDBProxyTestBase(TestBase): + """ + Base class for gdbserver proxy tests. + + This class will setup and start a mock GDB server for the test to use. + It pases through requests to a regular lldb-server/debugserver and + forwards replies back to the LLDB under test. 
+ """ + + """The gdbserver that we implement.""" + server = None + """The inner lldb-server/debugserver process that we proxy requests into.""" + monitor_server = None + monitor_sock = None + + server_socket_class = TCPServerSocket + + DEFAULT_TIMEOUT = 20 * (10 if ("ASAN_OPTIONS" in os.environ) else 1) + + _verbose_log_handler = None + _log_formatter = logging.Formatter(fmt="%(asctime)-15s %(levelname)-8s %(message)s") + + def setUpBaseLogging(self): + self.logger = logging.getLogger(__name__) + + if len(self.logger.handlers) > 0: + return # We have set up this handler already + + self.logger.propagate = False + self.logger.setLevel(logging.DEBUG) + + # log all warnings to stderr + handler = logging.StreamHandler() + handler.setLevel(logging.WARNING) + handler.setFormatter(self._log_formatter) + self.logger.addHandler(handler) + + def setUp(self): + TestBase.setUp(self) + + self.setUpBaseLogging() + + if self.isVerboseLoggingRequested(): + # If requested, full logs go to a log file + log_file_name = self.getLogBasenameForCurrentTest() + "-proxy.log" + self._verbose_log_handler = logging.FileHandler( + log_file_name + ) + self._verbose_log_handler.setFormatter(self._log_formatter) + self._verbose_log_handler.setLevel(logging.DEBUG) + self.logger.addHandler(self._verbose_log_handler) + + lldb_server_exe = lldbgdbserverutils.get_lldb_server_exe() + if lldb_server_exe is None: + self.debug_monitor_exe = lldbgdbserverutils.get_debugserver_exe() + self.assertTrue(self.debug_monitor_exe is not None) + self.debug_monitor_extra_args = [] + else: + self.debug_monitor_exe = lldb_server_exe + self.debug_monitor_extra_args = ["gdbserver"] + + self.server = MockGDBServer(self.server_socket_class()) + self.server.responder = self + + def tearDown(self): + # TestBase.tearDown will kill the process, but we need to kill it early + # so its client connection closes and we can stop the server before + # finally calling the base tearDown. 
+ if self.process() is not None: + self.process().Kill() + self.server.stop() + + self.logger.removeHandler(self._verbose_log_handler) + self._verbose_log_handler = None + + TestBase.tearDown(self) + + def isVerboseLoggingRequested(self): + # We will report our detailed logs if the user requested that the "gdb-remote" channel is + # logged. + return any(("gdb-remote" in channel) for channel in lldbtest_config.channels) + + def connect(self, target): + """ + Create a process by connecting to the mock GDB server. + """ + self.prep_debug_monitor_and_inferior() + self.server.start() + + listener = self.dbg.GetListener() + error = lldb.SBError() + process = target.ConnectRemote( + listener, self.server.get_connect_url(), "gdb-remote", error + ) + self.assertTrue(error.Success(), error.description) + self.assertTrue(process, PROCESS_IS_VALID) + return process + + def get_next_port(self): + return 12000 + random.randint(0, 3999) + + def prep_debug_monitor_and_inferior(self): + inferior_exe_path = self.getBuildArtifact("a.out") + self.connect_to_debug_monitor([inferior_exe_path]) + self.assertIsNotNone(self.monitor_server) + self.initial_handshake() + + def initial_handshake(self): + self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "+") + self.monitor_server.send_packet(seven.bitcast_to_bytes("QStartNoAckMode")) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "+") + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "OK") + self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.assertEqual(reply, "+") + + def get_debug_monitor_command_line_args(self, connect_address, launch_args): + return self.debug_monitor_extra_args + ["--reverse-connect", connect_address] + launch_args + + 
def launch_debug_monitor(self, launch_args): + family, type, proto, _, addr = socket.getaddrinfo( + "localhost", 0, proto=socket.IPPROTO_TCP + )[0] + sock = socket.socket(family, type, proto) + sock.settimeout(self.DEFAULT_TIMEOUT) + sock.bind(addr) + sock.listen(1) + addr = sock.getsockname() + connect_address = "[{}]:{}".format(*addr) + + commandline_args = self.get_debug_monitor_command_line_args( + connect_address, launch_args + ) + + # Start the server. + self.logger.info(f"Spawning monitor {commandline_args}") + monitor_process = self.spawnSubprocess( + self.debug_monitor_exe, commandline_args, install_remote=False + ) + self.assertIsNotNone(monitor_process) + + self.monitor_sock = sock.accept()[0] + self.monitor_sock.settimeout(self.DEFAULT_TIMEOUT) + return monitor_process + + def connect_to_debug_monitor(self, launch_args): + monitor_process = self.launch_debug_monitor(launch_args) + self.monitor_server = lldbgdbserverutils.Server(self.monitor_sock, monitor_process) + + def respond(self, packet): + """Subclasses can override this to change how packets are handled.""" + return self.pass_through(packet) + + def pass_through(self, packet): + self.logger.info(f"Sending packet {packet}") + self.monitor_server.send_packet(seven.bitcast_to_bytes(packet)) + reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) + self.logger.info(f"Received reply {reply}") + return reply diff --git a/lldb/packages/Python/lldbsuite/test/lldbreverse.py b/lldb/packages/Python/lldbsuite/test/lldbreverse.py new file mode 100644 index 00000000000000..0f02fdffbdeada --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/lldbreverse.py @@ -0,0 +1,418 @@ +import os +import os.path +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbgdbproxy import * +import lldbgdbserverutils +import re + + +class ThreadSnapshot: + def __init__(self, thread_id, registers): + self.thread_id = thread_id + self.registers 
class ThreadSnapshot:
    """Saved register state of one thread at a single point in time."""

    def __init__(self, thread_id, registers):
        self.thread_id = thread_id
        # Map of lldb register index -> hex-encoded value string, exactly as
        # returned by the gdb-remote 'p' packet.
        self.registers = registers


class MemoryBlockSnapshot:
    """Saved contents of one contiguous block of target memory."""

    def __init__(self, address, data):
        self.address = address
        # Hex-encoded byte string as captured from an 'm' packet reply.
        self.data = data


class StateSnapshot:
    """All thread registers plus stack memory, captured before one instruction."""

    def __init__(self, thread_snapshots, memory):
        self.thread_snapshots = thread_snapshots
        self.memory = memory
        # Filled in later with the ID of the thread that was stepped.
        self.thread_id = None


class RegisterInfo:
    """Describes one general-purpose register reported by qRegisterInfo."""

    def __init__(self, lldb_index, bitsize, little_endian):
        self.lldb_index = lldb_index
        self.bitsize = bitsize
        self.little_endian = little_endian


# Stack bytes saved around each thread's SP when snapshotting.
BELOW_STACK_POINTER = 16384
ABOVE_STACK_POINTER = 4096

# Granularity (bytes) of memory reads/writes used for snapshots.
BLOCK_SIZE = 1024

# Indices into ReverseTestBase.breakpoints, matching the type field of the
# gdb-remote 'z'/'Z' packets.
SOFTWARE_BREAKPOINTS = 0
HARDWARE_BREAKPOINTS = 1
WRITE_WATCHPOINTS = 2


class ReverseTestBase(GDBProxyTestBase):
    """
    Base class for tests that need reverse execution.

    This class uses a gdbserver proxy to add very limited reverse-
    execution capability to lldb-server/debugserver for testing
    purposes only.

    To use this class, run the inferior forward until some stopping point.
    Then call `start_recording()` and execute forward again until reaching
    a software breakpoint; this class records the state before each execution executes.
    At that point, the server will accept "bc" and "bs" packets to step
    backwards through the state.
    When executing during recording, we only allow single-step and continue without
    delivering a signal, and only software breakpoint stops are allowed.

    We assume that while recording is enabled, the only effects of instructions
    are on general-purpose registers (read/written by the 'g' and 'G' packets)
    and on memory bytes between [SP - BELOW_STACK_POINTER, SP + ABOVE_STACK_POINTER).
    """

    # A list of StateSnapshots in time order.
    #
    # There is one snapshot per single-stepped instruction, representing the
    # state before that instruction was executed. The last snapshot in the
    # list is the snapshot before the last instruction was executed.
    # This is an undo log; we snapshot a superset of the state that may have
    # been changed by the instruction's execution.
    snapshots = None
    recording_enabled = False

    # Five sets indexed by z/Z breakpoint type 0-4; only SOFTWARE_BREAKPOINTS,
    # HARDWARE_BREAKPOINTS and WRITE_WATCHPOINTS are consulted.
    breakpoints = None

    pid = None

    pc_register_info = None
    sp_register_info = None
    general_purpose_register_info = None

    def __init__(self, *args, **kwargs):
        GDBProxyTestBase.__init__(self, *args, **kwargs)
        self.breakpoints = [set(), set(), set(), set(), set()]

    def respond(self, packet):
        """Intercept proxied packets and implement "bs"/"bc" on top of them."""
        if not packet:
            raise ValueError("Invalid empty packet")
        if packet == self.server.PACKET_INTERRUPT:
            # Don't send a response. We'll just run to completion.
            return []
        if self.is_command(packet, "qSupported", ":"):
            # Advertise our reverse-execution extensions on top of whatever
            # the real server supports.
            reply = self.pass_through(packet)
            return reply + ";ReverseStep+;ReverseContinue+"
        if self.is_command(packet, "vCont", ";"):
            if self.recording_enabled:
                return self.continue_with_recording(packet)
            # A forward continue invalidates any recorded history (see
            # stop_recording()).
            # BUG FIX: the original assigned a dead local `snapshots = []`;
            # the history lives in self.snapshots.
            self.snapshots = []
        if packet[0] == "c" or packet[0] == "s" or packet[0] == "C" or packet[0] == "S":
            raise ValueError("LLDB should not be sending old-style continuation packets")
        if packet == "bc":
            return self.reverse_continue()
        if packet == "bs":
            return self.reverse_step()
        if packet == "jThreadsInfo":
            # Suppress this because it contains thread stop reasons which we might
            # need to modify, and we don't want to have to implement that.
            return ""
        if packet[0] == "z" or packet[0] == "Z":
            reply = self.pass_through(packet)
            if reply == "OK":
                self.update_breakpoints(packet)
            return reply
        return GDBProxyTestBase.respond(self, packet)

    def start_recording(self):
        """Start snapshotting the target before each forward step/continue."""
        self.recording_enabled = True
        self.snapshots = []

    def stop_recording(self):
        """
        Don't record when executing forward.

        Reverse execution is still supported until the next forward continue.
        """
        self.recording_enabled = False

    def is_command(self, packet, cmd, follow_token):
        # True when packet is exactly `cmd`, or starts with `cmd` + follow_token.
        return packet == cmd or packet[0 : len(cmd) + 1] == cmd + follow_token

    def update_breakpoints(self, packet):
        # Track z/Z (remove/insert breakpoint) packets so we can recognize
        # breakpoint and watchpoint hits while reverse-executing.
        m = re.match("([zZ])([01234]),([0-9a-f]+),([0-9a-f]+)", packet)
        if m is None:
            raise ValueError("Invalid breakpoint packet: " + packet)
        t = int(m.group(2))
        addr = int(m.group(3), 16)
        kind = int(m.group(4), 16)
        if m.group(1) == "Z":
            self.breakpoints[t].add((addr, kind))
        else:
            self.breakpoints[t].discard((addr, kind))

    def breakpoint_triggered_at(self, pc):
        # True if a software or hardware breakpoint is set at exactly `pc`.
        if any(addr == pc for addr, kind in self.breakpoints[SOFTWARE_BREAKPOINTS]):
            return True
        if any(addr == pc for addr, kind in self.breakpoints[HARDWARE_BREAKPOINTS]):
            return True
        return False

    def watchpoint_triggered(self, new_value_block, current_contents):
        """Returns the address or None."""
        # BUG FIX: the original iterated over the bare name `breakpoints`
        # (NameError at runtime); the breakpoint sets live on self.
        for watch_addr, kind in self.breakpoints[WRITE_WATCHPOINTS]:
            for offset in range(0, kind):
                addr = watch_addr + offset
                # NOTE(review): block.data is hex text (2 chars per byte), so
                # comparing addr against address + len(data) overshoots the
                # byte range; out-of-range indices compare empty slices and
                # are harmless, but confirm the intended bound.
                if addr >= new_value_block.address and addr < new_value_block.address + len(
                    new_value_block.data
                ):
                    index = addr - new_value_block.address
                    if (
                        new_value_block.data[index * 2 : (index + 1) * 2]
                        != current_contents[index * 2 : (index + 1) * 2]
                    ):
                        return watch_addr
        return None

    def continue_with_recording(self, packet):
        """Single-step the target, snapshotting before each step, until a
        software breakpoint is hit (or one step completes, for vCont;s)."""
        self.logger.debug("Continue with recording enabled")

        step_packet = "vCont;s"
        if packet == "vCont":
            requested_step = False
        else:
            m = re.match("vCont;(c|s)(.*)", packet)
            if m is None:
                raise ValueError("Unsupported vCont packet: " + packet)
            requested_step = m.group(1) == "s"
            step_packet += m.group(2)

        while True:
            snapshot = self.capture_snapshot()
            reply = self.pass_through(step_packet)
            (stop_signal, stop_pairs) = self.parse_stop(reply)
            if stop_signal != 5:
                raise ValueError("Unexpected stop signal: " + reply)
            is_swbreak = False
            thread_id = None
            for key, value in stop_pairs.items():
                if key == "thread":
                    thread_id = self.parse_thread_id(value)
                    continue
                if re.match("[0-9a-f]+", key):
                    # Register-number/value pair; not needed here.
                    continue
                if key == "swbreak" or (key == "reason" and value == "breakpoint"):
                    is_swbreak = True
                    continue
                if key in ["name", "threads", "thread-pcs", "reason"]:
                    continue
                raise ValueError(f"Unknown stop key '{key}' in {reply}")
            if is_swbreak:
                self.logger.debug("Recording stopped")
                return reply
            if thread_id is None:
                # BUG FIX: the original *returned* the ValueError instance
                # instead of raising it.
                raise ValueError("Expected thread ID: " + reply)
            snapshot.thread_id = thread_id
            self.snapshots.append(snapshot)
            if requested_step:
                self.logger.debug("Recording stopped for step")
                return reply

    def parse_stop(self, reply):
        """Parse a 'Txx...' stop reply into (signal_number, {key: value})."""
        result = {}
        if not reply:
            raise ValueError("Invalid empty packet")
        if reply[0] == "T" and len(reply) >= 3:
            result = {k: v for k, v in self.parse_pairs(reply[3:])}
            return (int(reply[1:3], 16), result)
        # BUG FIX: raising a plain string is a TypeError in Python 3.
        raise ValueError("Unsupported stop reply: " + reply)

    def parse_pairs(self, text):
        # Yield (key, value) tuples from a "k1:v1;k2:v2;..." string.
        for pair in text.split(";"):
            if not pair:
                continue
            m = re.match("([^:]+):(.*)", pair)
            if m is None:
                raise ValueError("Invalid pair text: " + text)
            yield (m.group(1), m.group(2))

    def capture_snapshot(self):
        """Snapshot all threads and their stack memories."""
        self.ensure_register_info()
        current_thread = self.get_current_thread()
        thread_snapshots = []
        memory = []
        for thread_id in self.get_thread_list():
            registers = {}
            for index in sorted(self.general_purpose_register_info.keys()):
                reply = self.pass_through(f"p{index:x};thread:{thread_id:x};")
                if reply == "" or reply[0] == "E":
                    raise ValueError("Can't read register")
                registers[index] = reply
            thread_snapshot = ThreadSnapshot(thread_id, registers)
            thread_sp = self.get_register(
                self.sp_register_info, thread_snapshot.registers
            )
            memory += self.read_memory(
                thread_sp - BELOW_STACK_POINTER, thread_sp + ABOVE_STACK_POINTER
            )
            thread_snapshots.append(thread_snapshot)
        self.set_current_thread(current_thread)
        return StateSnapshot(thread_snapshots, memory)

    def restore_snapshot(self, snapshot):
        """
        Restore the snapshot during reverse execution.

        If this triggers a breakpoint or watchpoint, return the stop reply,
        otherwise None.
        """
        current_thread = self.get_current_thread()
        stop_reasons = []
        for thread_snapshot in snapshot.thread_snapshots:
            thread_id = thread_snapshot.thread_id
            for lldb_index in sorted(thread_snapshot.registers.keys()):
                data = thread_snapshot.registers[lldb_index]
                reply = self.pass_through(
                    f"P{lldb_index:x}={data};thread:{thread_id:x};"
                )
                if reply != "OK":
                    raise ValueError("Can't restore thread register")
            if thread_id == snapshot.thread_id:
                new_pc = self.get_register(
                    self.pc_register_info, thread_snapshot.registers
                )
                if self.breakpoint_triggered_at(new_pc):
                    stop_reasons.append([("reason", "breakpoint")])
        self.set_current_thread(current_thread)
        for block in snapshot.memory:
            # BUG FIX: len(...)/2 yields a float and "{:x}" on a float raises
            # ValueError; the packet length must be an integer byte count.
            num_bytes = len(block.data) // 2
            current_memory = self.pass_through(f"m{block.address:x},{num_bytes:x}")
            if not current_memory or current_memory[0] == "E":
                raise ValueError("Can't read back memory")
            reply = self.pass_through(
                f"M{block.address:x},{num_bytes:x}:" + block.data
            )
            if reply != "OK":
                raise ValueError("Can't restore memory")
            # NOTE(review): current_memory[1:] drops the first hex digit of
            # the 'm' reply — confirm the leading character really is framing
            # and not data (read_memory does the same).
            watch_addr = self.watchpoint_triggered(block, current_memory[1:])
            if watch_addr is not None:
                stop_reasons.append(
                    [("reason", "watchpoint"), ("watch", f"{watch_addr:x}")]
                )
        if stop_reasons:
            pairs = ";".join(f"{key}:{value}" for key, value in stop_reasons[0])
            return f"T05thread:{self.pid:x}.{snapshot.thread_id:x};{pairs};"
        return None

    def reverse_step(self):
        # Undo one instruction, or report the history boundary if there is
        # nothing left to undo.
        if not self.snapshots:
            self.logger.debug("Reverse-step at history boundary")
            return self.history_boundary_reply(self.get_current_thread())
        self.logger.debug("Reverse-step started")
        snapshot = self.snapshots.pop()
        stop_reply = self.restore_snapshot(snapshot)
        self.set_current_thread(snapshot.thread_id)
        self.logger.debug("Reverse-step stopped")
        if stop_reply is None:
            return self.singlestep_stop_reply(snapshot.thread_id)
        return stop_reply

    def reverse_continue(self):
        # Undo instructions until a breakpoint/watchpoint fires or the
        # recorded history is exhausted.
        self.logger.debug("Reverse-continue started")
        thread_id = None
        while self.snapshots:
            snapshot = self.snapshots.pop()
            stop_reply = self.restore_snapshot(snapshot)
            thread_id = snapshot.thread_id
            if stop_reply is not None:
                self.set_current_thread(thread_id)
                self.logger.debug("Reverse-continue stopped")
                return stop_reply
        if thread_id is None:
            thread_id = self.get_current_thread()
        else:
            self.set_current_thread(snapshot.thread_id)
        self.logger.debug("Reverse-continue stopped at history boundary")
        return self.history_boundary_reply(thread_id)

    def get_current_thread(self):
        reply = self.pass_through("qC")
        return self.parse_thread_id(reply[2:])

    def parse_thread_id(self, thread_id):
        """Parse an optionally pid-qualified "[pNN.]NN" thread ID string."""
        m = re.match("(p([0-9a-f]+)[.])?([0-9a-f]+)$", thread_id)
        if m is None:
            raise ValueError("Invalid thread ID: " + thread_id)
        # BUG FIX guard: without a "pNN." prefix m.group(2) is None, and
        # int(None, 16) raises TypeError; only learn the PID when present.
        if self.pid is None and m.group(2) is not None:
            self.pid = int(m.group(2), 16)
        return int(m.group(3), 16)

    def history_boundary_reply(self, thread_id):
        return f"T00thread:{self.pid:x}.{thread_id:x};replaylog:begin;"

    def singlestep_stop_reply(self, thread_id):
        return f"T05thread:{self.pid:x}.{thread_id:x};"

    def set_current_thread(self, thread_id):
        """
        Set current thread in inner gdbserver.
        """
        if thread_id >= 0:
            self.pass_through(f"Hg{self.pid:x}.{thread_id:x}")
            self.pass_through(f"Hc{self.pid:x}.{thread_id:x}")
        else:
            self.pass_through(f"Hc-1.-1")
            self.pass_through(f"Hg-1.-1")

    def get_register(self, register_info, registers):
        """Decode one captured register's hex string into an integer value."""
        if register_info.bitsize % 8 != 0:
            raise ValueError("Register size must be a multiple of 8 bits")
        if register_info.lldb_index not in registers:
            raise ValueError("Register value not captured")
        data = registers[register_info.lldb_index]
        num_bytes = register_info.bitsize // 8
        bytes = []
        for i in range(0, num_bytes):
            bytes.append(int(data[i * 2 : (i + 1) * 2], 16))
        if register_info.little_endian:
            bytes.reverse()
        result = 0
        for byte in bytes:
            result = (result << 8) + byte
        return result

    def read_memory(self, start_addr, end_addr):
        """
        Read a region of memory from the target.

        Some of the addresses may extend into invalid virtual memory;
        skip those areas.
        Return a list of blocks containing the valid area(s) in the
        requested range.
        """
        regions = []
        # BUG FIX: the original masked with (BLOCK_SIZE - 1), which keeps
        # only the low bits (mapping every address into [0, BLOCK_SIZE));
        # block alignment requires the complement mask.
        start_addr = start_addr & ~(BLOCK_SIZE - 1)
        end_addr = (end_addr + BLOCK_SIZE - 1) & ~(BLOCK_SIZE - 1)
        for addr in range(start_addr, end_addr, BLOCK_SIZE):
            reply = self.pass_through(f"m{addr:x},{(BLOCK_SIZE - 1):x}")
            if reply and reply[0] != "E":
                # NOTE(review): reply[1:] drops the first hex digit of the
                # 'm' reply; confirm the leading character is protocol
                # framing rather than data (restore_snapshot matches this).
                block = MemoryBlockSnapshot(addr, reply[1:])
                regions.append(block)
        return regions

    def ensure_register_info(self):
        # Lazily query qRegisterInfo once, caching the general-purpose
        # register layout plus the generic pc/sp registers.
        if self.general_purpose_register_info is not None:
            return
        reply = self.pass_through("qHostInfo")
        little_endian = any(
            kv == ("endian", "little") for kv in self.parse_pairs(reply)
        )
        self.general_purpose_register_info = {}
        lldb_index = 0
        while True:
            reply = self.pass_through(f"qRegisterInfo{lldb_index:x}")
            if not reply or reply[0] == "E":
                break
            info = {k: v for k, v in self.parse_pairs(reply)}
            reg_info = RegisterInfo(lldb_index, int(info["bitsize"]), little_endian)
            if info["set"] == "General Purpose Registers" and not "container-regs" in info:
                self.general_purpose_register_info[lldb_index] = reg_info
            if "generic" in info:
                if info["generic"] == "pc":
                    self.pc_register_info = reg_info
                elif info["generic"] == "sp":
                    self.sp_register_info = reg_info
            lldb_index += 1
        if self.pc_register_info is None or self.sp_register_info is None:
            raise ValueError("Can't find generic pc or sp register")

    def get_thread_list(self):
        # Collect thread IDs via qfThreadInfo/qsThreadInfo until the 'l'
        # (end-of-list) reply.
        threads = []
        reply = self.pass_through("qfThreadInfo")
        while True:
            if not reply:
                raise ValueError("Missing reply packet")
            if reply[0] == "m":
                for id in reply[1:].split(","):
                    threads.append(self.parse_thread_id(id))
            elif reply[0] == "l":
                return threads
            reply = self.pass_through("qsThreadInfo")
should be stopped due to watchpoint" +STOPPED_DUE_TO_HISTORY_BOUNDARY = "Process should be stopped due to history boundary" + DATA_TYPES_DISPLAYED_CORRECTLY = "Data type(s) displayed correctly" VALID_BREAKPOINT = "Got a valid breakpoint" diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index 9773144723c34c..07780f9f9c8393 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -564,6 +564,10 @@ uint32_t SBProcess::GetAddressByteSize() const { } SBError SBProcess::Continue() { + return Continue(RunDirection::eRunForward); +} + +SBError SBProcess::Continue(RunDirection direction) { LLDB_INSTRUMENT_VA(this); SBError sb_error; @@ -574,9 +578,9 @@ SBError SBProcess::Continue() { process_sp->GetTarget().GetAPIMutex()); if (process_sp->GetTarget().GetDebugger().GetAsyncExecution()) - sb_error.ref() = process_sp->Resume(); + sb_error.ref() = process_sp->Resume(direction); else - sb_error.ref() = process_sp->ResumeSynchronous(nullptr); + sb_error.ref() = process_sp->ResumeSynchronous(nullptr, direction); } else sb_error = Status::FromErrorString("SBProcess is invalid"); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index a99456e06d0329..aca8a039952960 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -172,6 +172,7 @@ size_t SBThread::GetStopReasonDataCount() { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: + case eStopReasonHistoryBoundary: // There is no data for these stop reasons. return 0; @@ -233,6 +234,7 @@ uint64_t SBThread::GetStopReasonDataAtIndex(uint32_t idx) { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: + case eStopReasonHistoryBoundary: // There is no data for these stop reasons. 
return 0; diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 8d3a82ef6c990a..ea60492ac46a10 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2553,7 +2553,8 @@ bool CommandInterpreter::DidProcessStopAbnormally() const { const StopReason reason = stop_info->GetStopReason(); if (reason == eStopReasonException || reason == eStopReasonInstrumentation || - reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt) + reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt || + reason == eStopReasonHistoryBoundary) return true; if (reason == eStopReasonSignal) { diff --git a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp index de047ee214c11e..b0aa664775b463 100644 --- a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp @@ -82,6 +82,9 @@ void LogThreadStopInfo(Log &log, const ThreadStopInfo &stop_info, case eStopReasonProcessorTrace: log.Printf("%s: %s processor trace", __FUNCTION__, header); return; + case eStopReasonHistoryBoundary: + log.Printf("%s: %s history boundary", __FUNCTION__, header); + return; default: log.Printf("%s: %s invalid stop reason %" PRIu32, __FUNCTION__, header, static_cast(stop_info.reason)); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 9b2907c6809965..116c43343c01d1 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -402,9 +402,16 @@ lldb_private::DynamicLoader *ProcessKDP::GetDynamicLoader() { Status ProcessKDP::WillResume() { return Status(); } -Status ProcessKDP::DoResume() { +Status ProcessKDP::DoResume(RunDirection direction) { Status error; Log *log = GetLog(KDPLog::Process); + + if 
(direction == RunDirection::eRunReverse) { + error.SetErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + return error; + } + // Only start the async thread if we try to do any process control if (!m_async_thread.IsJoinable()) StartAsyncThread(); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h index e5ec5914f9600d..1b71d83f70b087 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h @@ -90,7 +90,7 @@ class ProcessKDP : public lldb_private::Process { // Process Control lldb_private::Status WillResume() override; - lldb_private::Status DoResume() override; + lldb_private::Status DoResume(lldb::RunDirection direction) override; lldb_private::Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index 703aa082f0476f..76b7095deaa503 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -204,11 +204,17 @@ ProcessWindows::DoAttachToProcessWithID(lldb::pid_t pid, return error; } -Status ProcessWindows::DoResume() { +Status ProcessWindows::DoResume(RunDirection direction) { Log *log = GetLog(WindowsLog::Process); llvm::sys::ScopedLock lock(m_mutex); Status error; + if (direction == RunDirection::eRunReverse) { + error.SetErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + return error; + } + StateType private_state = GetPrivateState(); if (private_state == eStateStopped || private_state == eStateCrashed) { LLDB_LOG(log, "process {0} is in state {1}. 
Resuming...", diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h index e97cfb790248be..97284b7cd1436e 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h @@ -52,7 +52,7 @@ class ProcessWindows : public Process, public ProcessDebugger { Status DoAttachToProcessWithID( lldb::pid_t pid, const lldb_private::ProcessAttachInfo &attach_info) override; - Status DoResume() override; + Status DoResume(lldb::RunDirection direction) override; Status DoDestroy() override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index e42526c8fd7266..fc792a4409410b 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -199,6 +199,20 @@ uint64_t GDBRemoteCommunicationClient::GetRemoteMaxPacketSize() { return m_max_packet_size; } +bool GDBRemoteCommunicationClient::GetReverseContinueSupported() { + if (m_supports_reverse_continue == eLazyBoolCalculate) { + GetRemoteQSupported(); + } + return m_supports_reverse_continue == eLazyBoolYes; +} + +bool GDBRemoteCommunicationClient::GetReverseStepSupported() { + if (m_supports_reverse_step == eLazyBoolCalculate) { + GetRemoteQSupported(); + } + return m_supports_reverse_step == eLazyBoolYes; +} + bool GDBRemoteCommunicationClient::QueryNoAckModeSupported() { if (m_supports_not_sending_acks == eLazyBoolCalculate) { m_send_acks = true; @@ -295,6 +309,8 @@ void GDBRemoteCommunicationClient::ResetDiscoverableSettings(bool did_exec) { m_supports_qXfer_siginfo_read = eLazyBoolCalculate; m_supports_augmented_libraries_svr4_read = eLazyBoolCalculate; m_uses_native_signals = eLazyBoolCalculate; + m_supports_reverse_continue = 
eLazyBoolCalculate; + m_supports_reverse_step = eLazyBoolCalculate; m_supports_qProcessInfoPID = true; m_supports_qfProcessInfo = true; m_supports_qUserName = true; @@ -348,6 +364,8 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_memory_tagging = eLazyBoolNo; m_supports_qSaveCore = eLazyBoolNo; m_uses_native_signals = eLazyBoolNo; + m_supports_reverse_continue = eLazyBoolNo; + m_supports_reverse_step = eLazyBoolNo; m_max_packet_size = UINT64_MAX; // It's supposed to always be there, but if // not, we assume no limit @@ -401,6 +419,10 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_qSaveCore = eLazyBoolYes; else if (x == "native-signals+") m_uses_native_signals = eLazyBoolYes; + else if (x == "ReverseContinue+") + m_supports_reverse_continue = eLazyBoolYes; + else if (x == "ReverseStep+") + m_supports_reverse_step = eLazyBoolYes; // Look for a list of compressions in the features list e.g. // qXfer:features:read+;PacketSize=20000;qEcho+;SupportedCompressions=zlib- // deflate,lzma diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h index 898d176abc3465..116b47c1edf033 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h @@ -331,6 +331,10 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { bool GetMultiprocessSupported(); + bool GetReverseContinueSupported(); + + bool GetReverseStepSupported(); + LazyBool SupportsAllocDeallocMemory() // const { // Uncomment this to have lldb pretend the debug server doesn't respond to @@ -561,6 +565,8 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { LazyBool m_supports_memory_tagging = eLazyBoolCalculate; LazyBool m_supports_qSaveCore = eLazyBoolCalculate; LazyBool m_uses_native_signals = eLazyBoolCalculate; + LazyBool 
m_supports_reverse_continue = eLazyBoolCalculate; + LazyBool m_supports_reverse_step = eLazyBoolCalculate; bool m_supports_qProcessInfoPID : 1, m_supports_qfProcessInfo : 1, m_supports_qUserName : 1, m_supports_qGroupName : 1, diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 35fa93e53bc66f..4016cde74ebea8 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -716,6 +716,7 @@ static const char *GetStopReasonString(StopReason stop_reason) { return "vforkdone"; case eStopReasonInterrupt: return "async interrupt"; + case eStopReasonHistoryBoundary: case eStopReasonInstrumentation: case eStopReasonInvalid: case eStopReasonPlanComplete: diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3e09c316d74f44..3fc03bd05d5df0 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -169,6 +169,10 @@ class PluginProperties : public Properties { } }; +std::chrono::seconds ResumeTimeout() { + return std::chrono::seconds(5); +} + } // namespace static PluginProperties &GetGlobalPluginProperties() { @@ -1180,10 +1184,11 @@ Status ProcessGDBRemote::WillResume() { return Status(); } -Status ProcessGDBRemote::DoResume() { +Status ProcessGDBRemote::DoResume(RunDirection direction) { Status error; Log *log = GetLog(GDBRLog::Process); - LLDB_LOGF(log, "ProcessGDBRemote::Resume()"); + LLDB_LOGF(log, "ProcessGDBRemote::Resume(%s)", + direction == RunDirection::eRunForward ? 
"" : "reverse"); ListenerSP listener_sp( Listener::MakeListener("gdb-remote.resume-packet-sent")); @@ -1197,12 +1202,21 @@ Status ProcessGDBRemote::DoResume() { StreamString continue_packet; bool continue_packet_error = false; - if (m_gdb_comm.HasAnyVContSupport()) { + // Number of threads continuing with "c", i.e. continuing without a signal to deliver. + const size_t num_continue_c_tids = m_continue_c_tids.size(); + // Number of threads continuing with "C", i.e. continuing with a signal to deliver. + const size_t num_continue_C_tids = m_continue_C_tids.size(); + // Number of threads continuing with "s", i.e. single-stepping. + const size_t num_continue_s_tids = m_continue_s_tids.size(); + // Number of threads continuing with "S", i.e. single-stepping with a signal to deliver. + const size_t num_continue_S_tids = m_continue_S_tids.size(); + if (direction == RunDirection::eRunForward && + m_gdb_comm.HasAnyVContSupport()) { std::string pid_prefix; if (m_gdb_comm.GetMultiprocessSupported()) pid_prefix = llvm::formatv("p{0:x-}.", GetID()); - if (m_continue_c_tids.size() == num_threads || + if (num_continue_c_tids == num_threads || (m_continue_c_tids.empty() && m_continue_C_tids.empty() && m_continue_s_tids.empty() && m_continue_S_tids.empty())) { // All threads are continuing @@ -1265,14 +1279,11 @@ Status ProcessGDBRemote::DoResume() { } else continue_packet_error = true; - if (continue_packet_error) { + if (direction == RunDirection::eRunForward && continue_packet_error) { // Either no vCont support, or we tried to use part of the vCont packet - // that wasn't supported by the remote GDB server. 
We need to try and - // make a simple packet that can do our continue - const size_t num_continue_c_tids = m_continue_c_tids.size(); - const size_t num_continue_C_tids = m_continue_C_tids.size(); - const size_t num_continue_s_tids = m_continue_s_tids.size(); - const size_t num_continue_S_tids = m_continue_S_tids.size(); + // that wasn't supported by the remote GDB server, or it's the reverse + // direction. We need to try and make a simple packet that can do our + // continue. if (num_continue_c_tids > 0) { if (num_continue_c_tids == num_threads) { // All threads are resuming... @@ -1363,9 +1374,41 @@ Status ProcessGDBRemote::DoResume() { } } + if (direction == RunDirection::eRunReverse && continue_packet_error) { + if (num_continue_C_tids > 0 || num_continue_S_tids > 0) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: Signals not supported"); + return Status::FromErrorString("can't deliver signals while running in reverse"); + } + + if (num_continue_s_tids > 0) { + if (num_continue_s_tids > 1) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: can't step multiple threads"); + return Status::FromErrorString("can't step multiple threads while reverse-stepping"); + } + + if (!m_gdb_comm.GetReverseStepSupported()) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-stepping"); + return Status::FromErrorString("target does not support reverse-stepping"); + } + + m_gdb_comm.SetCurrentThreadForRun(m_continue_s_tids.front()); + continue_packet.PutCString("bs"); + } else { + if (!m_gdb_comm.GetReverseContinueSupported()) { + LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-continue"); + return Status::FromErrorString("target does not support reverse-continue"); + } + + // All threads continue whether requested or not --- + // we can't change how threads ran in the past. 
+ continue_packet.PutCString("bc"); + } + + continue_packet_error = false; + } + if (continue_packet_error) { - error = - Status::FromErrorString("can't make continue packet for this resume"); + return Status::FromErrorString("can't make continue packet for this resume"); } else { EventSP event_sp; if (!m_async_thread.IsJoinable()) { @@ -1380,7 +1423,7 @@ Status ProcessGDBRemote::DoResume() { std::make_shared(continue_packet.GetString()); m_async_broadcaster.BroadcastEvent(eBroadcastBitAsyncContinue, data_sp); - if (!listener_sp->GetEvent(event_sp, std::chrono::seconds(5))) { + if (!listener_sp->GetEvent(event_sp, ResumeTimeout())) { error = Status::FromErrorString("Resume timed out."); LLDB_LOGF(log, "ProcessGDBRemote::DoResume: Resume timed out."); } else if (event_sp->BroadcasterIs(&m_async_broadcaster)) { @@ -1863,6 +1906,10 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo( thread_sp->SetStopInfo(StopInfo::CreateStopReasonWithException( *thread_sp, description.c_str())); handled = true; + } else if (reason == "replaylog") { + thread_sp->SetStopInfo(StopInfo::CreateStopReasonHistoryBoundary( + *thread_sp, description.c_str())); + handled = true; } else if (reason == "exec") { did_exec = true; thread_sp->SetStopInfo( @@ -2318,6 +2365,8 @@ StateType ProcessGDBRemote::SetThreadStopInfo(StringExtractor &stop_packet) { description = std::string(ostr.GetString()); } else if (key.compare("swbreak") == 0 || key.compare("hwbreak") == 0) { reason = "breakpoint"; + } else if (key.compare("replaylog") == 0) { + reason = "replaylog"; } else if (key.compare("library") == 0) { auto error = LoadModules(); if (error) { diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h index 2492795851388a..fa3e1cec76e2b3 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h @@ -111,7 +111,7 @@ class ProcessGDBRemote : public Process, // 
Process Control Status WillResume() override; - Status DoResume() override; + Status DoResume(lldb::RunDirection direction) override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp index d2111ce877ce55..304c12173dd35d 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp @@ -182,10 +182,15 @@ void ScriptedProcess::DidResume() { m_pid = GetInterface().GetProcessID(); } -Status ScriptedProcess::DoResume() { +Status ScriptedProcess::DoResume(RunDirection direction) { LLDB_LOGF(GetLog(LLDBLog::Process), "ScriptedProcess::%s resuming process", __FUNCTION__); - return GetInterface().Resume(); + if (direction == RunDirection::eRunForward) { + return GetInterface().Resume(); + } else { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); + } } Status ScriptedProcess::DoAttach(const ProcessAttachInfo &attach_info) { diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h index 0335364b4010b2..8ebe4ca5f3d449 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h @@ -52,7 +52,7 @@ class ScriptedProcess : public Process { void DidResume() override; - Status DoResume() override; + Status DoResume(lldb::RunDirection direction) override; Status DoAttachToProcessWithID(lldb::pid_t pid, const ProcessAttachInfo &attach_info) override; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index c009d17d3ba507..fd683728388215 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -446,7 +446,8 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, m_memory_cache(*this), m_allocated_memory_cache(*this), 
m_should_detach(false), m_next_event_action_up(), m_public_run_lock(), m_private_run_lock(), m_currently_handling_do_on_removals(false), - m_resume_requested(false), m_interrupt_tid(LLDB_INVALID_THREAD_ID), + m_resume_requested(false), m_last_run_direction(eRunForward), + m_interrupt_tid(LLDB_INVALID_THREAD_ID), m_finalizing(false), m_destructing(false), m_clear_thread_plans_on_stop(false), m_force_next_event_delivery(false), m_last_broadcast_state(eStateInvalid), m_destroy_in_process(false), @@ -845,6 +846,7 @@ bool Process::HandleProcessStateChangedEvent( switch (thread_stop_reason) { case eStopReasonInvalid: case eStopReasonNone: + case eStopReasonHistoryBoundary: break; case eStopReasonSignal: { @@ -1352,7 +1354,7 @@ void Process::SetPublicState(StateType new_state, bool restarted) { } } -Status Process::Resume() { +Status Process::Resume(RunDirection direction) { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "(plugin = %s) -- locking run lock", GetPluginName().data()); if (!m_public_run_lock.TrySetRunning()) { @@ -1361,7 +1363,7 @@ Status Process::Resume() { return Status::FromErrorString( "Resume request failed - process still running."); } - Status error = PrivateResume(); + Status error = PrivateResume(direction); if (!error.Success()) { // Undo running state change m_public_run_lock.SetStopped(); @@ -1369,7 +1371,7 @@ Status Process::Resume() { return error; } -Status Process::ResumeSynchronous(Stream *stream) { +Status Process::ResumeSynchronous(Stream *stream, RunDirection direction) { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "Process::ResumeSynchronous -- locking run lock"); if (!m_public_run_lock.TrySetRunning()) { @@ -1382,7 +1384,7 @@ Status Process::ResumeSynchronous(Stream *stream) { Listener::MakeListener(ResumeSynchronousHijackListenerName.data())); HijackProcessEvents(listener_sp); - Status error = PrivateResume(); + Status error = PrivateResume(direction); if (error.Success()) { StateType state = 
WaitForProcessToStop(std::nullopt, nullptr, true, listener_sp, stream, @@ -3239,7 +3241,7 @@ Status Process::ConnectRemote(llvm::StringRef remote_url) { return error; } -Status Process::PrivateResume() { +Status Process::PrivateResume(RunDirection direction) { Log *log(GetLog(LLDBLog::Process | LLDBLog::Step)); LLDB_LOGF(log, "Process::PrivateResume() m_stop_id = %u, public state: %s " @@ -3255,6 +3257,15 @@ Status Process::PrivateResume() { if (!GetModID().IsLastResumeForUserExpression()) ResetExtendedCrashInfoDict(); + if (m_last_run_direction != direction) { + // In the future we might want to support mixed-direction plans, + // e.g. a forward step-over stops at a breakpoint, the user does + // a reverse-step, then disables the breakpoint and continues forward. + // This code will need to be changed to support that. + m_thread_list.DiscardThreadPlans(); + m_last_run_direction = direction; + } + Status error(WillResume()); // Tell the process it is about to resume before the thread list if (error.Success()) { @@ -3272,7 +3283,7 @@ Status Process::PrivateResume() { "Process::PrivateResume PreResumeActions failed, not resuming."); } else { m_mod_id.BumpResumeID(); - error = DoResume(); + error = DoResume(direction); if (error.Success()) { DidResume(); m_thread_list.DidResume(); @@ -3735,7 +3746,7 @@ bool Process::ShouldBroadcastEvent(Event *event_ptr) { "from state: %s", static_cast(event_ptr), StateAsCString(state)); ProcessEventData::SetRestartedInEvent(event_ptr, true); - PrivateResume(); + PrivateResume(m_last_run_direction); } } else { return_value = true; @@ -4346,7 +4357,7 @@ void Process::ProcessEventData::DoOnRemoval(Event *event_ptr) { SetRestarted(true); // Use the private resume method here, since we aren't changing the run // lock state. 
- process_sp->PrivateResume(); + process_sp->PrivateResume(process_sp->m_last_run_direction); } else { bool hijacked = process_sp->IsHijackedForEvent(eBroadcastBitStateChanged) && !process_sp->StateChangedIsHijackedForSynchronousResume(); diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index bd7032b803df90..08e9a7c099bad2 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -1212,6 +1212,30 @@ class StopInfoProcessorTrace : public StopInfo { } }; +// StopInfoHistoryBoundary + +class StopInfoHistoryBoundary : public StopInfo { +public: + StopInfoHistoryBoundary(Thread &thread, const char *description) + : StopInfo(thread, LLDB_INVALID_UID) { + if (description) + SetDescription(description); + } + + ~StopInfoHistoryBoundary() override = default; + + StopReason GetStopReason() const override { + return eStopReasonHistoryBoundary; + } + + const char *GetDescription() override { + if (m_description.empty()) + return "history boundary"; + else + return m_description.c_str(); + } +}; + // StopInfoThreadPlan class StopInfoThreadPlan : public StopInfo { @@ -1439,6 +1463,11 @@ StopInfoSP StopInfo::CreateStopReasonProcessorTrace(Thread &thread, return StopInfoSP(new StopInfoProcessorTrace(thread, description)); } +StopInfoSP StopInfo::CreateStopReasonHistoryBoundary(Thread &thread, + const char *description) { + return StopInfoSP(new StopInfoHistoryBoundary(thread, description)); +} + StopInfoSP StopInfo::CreateStopReasonWithExec(Thread &thread) { return StopInfoSP(new StopInfoExec(thread)); } diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index 902fbb2b519ef7..bbb586f033b746 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -624,10 +624,12 @@ void Thread::SetupForResume() { // what the current plan is. 
lldb::RegisterContextSP reg_ctx_sp(GetRegisterContext()); - if (reg_ctx_sp) { + ProcessSP process_sp(GetProcess()); + if (reg_ctx_sp && process_sp && + process_sp->GetLastRunDirection() == eRunForward) { const addr_t thread_pc = reg_ctx_sp->GetPC(); BreakpointSiteSP bp_site_sp = - GetProcess()->GetBreakpointSiteList().FindByAddress(thread_pc); + process_sp->GetBreakpointSiteList().FindByAddress(thread_pc); if (bp_site_sp) { // Note, don't assume there's a ThreadPlanStepOverBreakpoint, the // target may not require anything special to step over a breakpoint. @@ -1732,6 +1734,8 @@ std::string Thread::StopReasonAsString(lldb::StopReason reason) { return "processor trace"; case eStopReasonInterrupt: return "async interrupt"; + case eStopReasonHistoryBoundary: + return "history boundary"; } return "StopReason = " + std::to_string(reason); diff --git a/lldb/test/API/functionalities/reverse-execution/Makefile b/lldb/test/API/functionalities/reverse-execution/Makefile new file mode 100644 index 00000000000000..10495940055b63 --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py new file mode 100644 index 00000000000000..b37578fbd82468 --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py @@ -0,0 +1,115 @@ +import lldb +import time +import unittest +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbreverse import ReverseTestBase +from lldbsuite.test import lldbutil + + +class TestReverseContinueBreakpoints(ReverseTestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_reverse_continue(self): + self.reverse_continue_internal(async_mode=False) + + def 
test_reverse_continue_async(self): + self.reverse_continue_internal(async_mode=True) + + def reverse_continue_internal(self, async_mode): + target, process, initial_threads = self.setup_recording(async_mode) + + # Reverse-continue. We'll stop at the point where we started recording. + status = process.Continue(lldb.eRunReverse) + self.assertSuccess(status) + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) + self.expect( + "thread list", + STOPPED_DUE_TO_HISTORY_BOUNDARY, + substrs=["stopped", "stop reason = history boundary"], + ) + + # Continue forward normally until the target exits. + status = process.Continue() + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateExited]) + self.assertSuccess(status) + self.assertState(process.GetState(), lldb.eStateExited) + self.assertEqual(process.GetExitStatus(), 0) + + def test_reverse_continue_breakpoint(self): + self.reverse_continue_breakpoint_internal(async_mode=False) + + def test_reverse_continue_breakpoint_async(self): + self.reverse_continue_breakpoint_internal(async_mode=True) + + def reverse_continue_breakpoint_internal(self, async_mode): + target, process, initial_threads = self.setup_recording(async_mode) + + # Reverse-continue to the function "trigger_breakpoint". 
+ trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) + status = process.Continue(lldb.eRunReverse) + self.assertSuccess(status) + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) + threads_now = lldbutil.get_threads_stopped_at_breakpoint(process, trigger_bkpt) + self.assertEqual(threads_now, initial_threads) + + def test_reverse_continue_skip_breakpoint(self): + self.reverse_continue_skip_breakpoint_internal(async_mode=False) + + def test_reverse_continue_skip_breakpoint_async(self): + self.reverse_continue_skip_breakpoint_internal(async_mode=True) + + def reverse_continue_skip_breakpoint_internal(self, async_mode): + target, process, initial_threads = self.setup_recording(async_mode) + + # Reverse-continue over a breakpoint at "trigger_breakpoint" whose + # condition is false. + # This tests that we continue in the correct direction after hitting + # the breakpoint. + trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) + trigger_bkpt.SetCondition("false_condition") + status = process.Continue(lldb.eRunReverse) + self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) + self.assertSuccess(status) + self.expect( + "thread list", + STOPPED_DUE_TO_HISTORY_BOUNDARY, + substrs=["stopped", "stop reason = history boundary"], + ) + + def setup_recording(self, async_mode): + """ + Record execution of code between "start_recording" and "stop_recording" breakpoints. + + Returns with the target stopped at "stop_recording", with recording disabled, + ready to reverse-execute. + """ + self.build() + target = self.dbg.CreateTarget("") + process = self.connect(target) + + # Record execution from the start of the function "start_recording" + # to the start of the function "stop_recording". We want to keep the + # interval that we record as small as possible to minimize the run-time + # of our single-stepping recorder. 
+ start_recording_bkpt = target.BreakpointCreateByName("start_recording", None) + initial_threads = lldbutil.continue_to_breakpoint(process, start_recording_bkpt) + self.assertEqual(len(initial_threads), 1) + target.BreakpointDelete(start_recording_bkpt.GetID()) + self.start_recording() + stop_recording_bkpt = target.BreakpointCreateByName("stop_recording", None) + lldbutil.continue_to_breakpoint(process, stop_recording_bkpt) + target.BreakpointDelete(stop_recording_bkpt.GetID()) + self.stop_recording() + + self.dbg.SetAsync(async_mode) + self.expect_async_state_changes(async_mode, process, [lldb.eStateStopped]) + + return target, process, initial_threads + + def expect_async_state_changes(self, async_mode, process, states): + if not async_mode: + return + listener = self.dbg.GetListener() + lldbutil.expect_state_changes(self, listener, process, states) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py new file mode 100644 index 00000000000000..d610761b8cb0bc --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py @@ -0,0 +1,30 @@ +import lldb +import unittest +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test import lldbutil + + +class TestReverseContinueNotSupported(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_reverse_continue_not_supported(self): + self.build() + exe = self.getBuildArtifact("a.out") + target = self.dbg.CreateTarget(exe) + self.assertTrue(target, VALID_TARGET) + + main_bkpt = target.BreakpointCreateByName("main", None) + self.assertTrue(main_bkpt, VALID_BREAKPOINT) + + process = target.LaunchSimple(None, None, self.get_process_working_directory()) + self.assertTrue(process, PROCESS_IS_VALID) + + # This will fail gracefully. 
+ status = process.Continue(lldb.eRunReverse) + self.assertFailure(status, "target does not support reverse-continue") + + status = process.Continue() + self.assertSuccess(status) + self.assertState(process.GetState(), lldb.eStateExited) + self.assertEqual(process.GetExitStatus(), 0) diff --git a/lldb/test/API/functionalities/reverse-execution/main.c b/lldb/test/API/functionalities/reverse-execution/main.c new file mode 100644 index 00000000000000..40e45dc9f5c317 --- /dev/null +++ b/lldb/test/API/functionalities/reverse-execution/main.c @@ -0,0 +1,14 @@ +volatile int false_condition = 0; + +static void start_recording() {} + +static void trigger_breakpoint() {} + +static void stop_recording() {} + +int main() { + start_recording(); + trigger_breakpoint(); + stop_recording(); + return 0; +} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 558f889c4b7f23..211fd34957f496 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -1045,6 +1045,9 @@ llvm::json::Value CreateThreadStopped(lldb::SBThread &thread, case lldb::eStopReasonProcessorTrace: body.try_emplace("reason", "processor trace"); break; + case lldb::eStopReasonHistoryBoundary: + body.try_emplace("reason", "history boundary"); + break; case lldb::eStopReasonSignal: case lldb::eStopReasonException: body.try_emplace("reason", "exception"); diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp index b38833c0fdb6b6..1c5e3ac7008727 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.cpp +++ b/lldb/tools/lldb-dap/LLDBUtils.cpp @@ -111,6 +111,7 @@ bool ThreadHasStopReason(lldb::SBThread &thread) { case lldb::eStopReasonVFork: case lldb::eStopReasonVForkDone: case lldb::eStopReasonInterrupt: + case lldb::eStopReasonHistoryBoundary: return true; case lldb::eStopReasonThreadExiting: case lldb::eStopReasonInvalid: From fae7d6848bbb59fc2bad17adbdb34bd6a11a0651 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 10 Oct 
2024 13:22:56 -0700 Subject: [PATCH 092/177] [lldb] SetErrorStringWithFormatv -> FromErrorStringWithFormatv (NFC) --- lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 116c43343c01d1..367fce442bb866 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -407,8 +407,9 @@ Status ProcessKDP::DoResume(RunDirection direction) { Log *log = GetLog(KDPLog::Process); if (direction == RunDirection::eRunReverse) { - error.SetErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); + error.FromErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", + GetPluginName()); return error; } From c686eeb7fcc89673909e7e1f0a0a09a0da269d28 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 10 Oct 2024 16:07:35 -0700 Subject: [PATCH 093/177] [lldb] skip ReverseContinue tests on Darwin This uses lldb-server in gdbserver mode, which requires a ProcessNative plugin. Darwin does not have a ProcessNative plugin; it uses debugserver instead of lldb-server. Skip these tests. 
--- .../reverse-execution/TestReverseContinueBreakpoints.py | 6 ++++++ .../reverse-execution/TestReverseContinueNotSupported.py | 1 + 2 files changed, 7 insertions(+) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py index b37578fbd82468..8b53d86704f119 100644 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py @@ -11,9 +11,11 @@ class TestReverseContinueBreakpoints(ReverseTestBase): NO_DEBUG_INFO_TESTCASE = True + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue(self): self.reverse_continue_internal(async_mode=False) + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_async(self): self.reverse_continue_internal(async_mode=True) @@ -37,9 +39,11 @@ def reverse_continue_internal(self, async_mode): self.assertState(process.GetState(), lldb.eStateExited) self.assertEqual(process.GetExitStatus(), 0) + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_breakpoint(self): self.reverse_continue_breakpoint_internal(async_mode=False) + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_breakpoint_async(self): self.reverse_continue_breakpoint_internal(async_mode=True) @@ -54,9 +58,11 @@ def reverse_continue_breakpoint_internal(self, async_mode): threads_now = lldbutil.get_threads_stopped_at_breakpoint(process, trigger_bkpt) self.assertEqual(threads_now, initial_threads) + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_skip_breakpoint(self): self.reverse_continue_skip_breakpoint_internal(async_mode=False) + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_skip_breakpoint_async(self): 
self.reverse_continue_skip_breakpoint_internal(async_mode=True) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py index d610761b8cb0bc..8a20f0ffdcf660 100644 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py @@ -8,6 +8,7 @@ class TestReverseContinueNotSupported(TestBase): NO_DEBUG_INFO_TESTCASE = True + @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_not_supported(self): self.build() exe = self.getBuildArtifact("a.out") From 1bf271d5a7de58faf525c3b90ef4a4a8ff47e688 Mon Sep 17 00:00:00 2001 From: William Junda Huang Date: Thu, 10 Oct 2024 19:10:07 -0400 Subject: [PATCH 094/177] Revert "[ThinLTO] Do not duplicate import a function that is actually defined in the current module" (#111919) Reverts llvm/llvm-project#110064 --- llvm/lib/Linker/IRMover.cpp | 6 +- .../Inputs/ditemplatevalueparameter-remap.ll | 29 ------- .../X86/ditemplatevalueparameter-remap.ll | 87 ------------------- 3 files changed, 1 insertion(+), 121 deletions(-) delete mode 100644 llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll delete mode 100644 llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 5067fbff2e277b..3a6c2678cd157f 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -595,15 +595,11 @@ Value *IRLinker::materialize(Value *V, bool ForIndirectSymbol) { if (!SGV) return nullptr; - // If SGV is from dest, it was already materialized when dest was loaded. - if (SGV->getParent() == &DstM) - return nullptr; - // When linking a global from other modules than source & dest, skip // materializing it because it would be mapped later when its containing // module is linked. 
Linking it now would potentially pull in many types that // may not be mapped properly. - if (SGV->getParent() != SrcM.get()) + if (SGV->getParent() != &DstM && SGV->getParent() != SrcM.get()) return nullptr; Expected NewProto = linkGlobalValueProto(SGV, ForIndirectSymbol); diff --git a/llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll b/llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll deleted file mode 100644 index be93160b943397..00000000000000 --- a/llvm/test/ThinLTO/X86/Inputs/ditemplatevalueparameter-remap.ll +++ /dev/null @@ -1,29 +0,0 @@ -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define void @_Z8thinlto1v() unnamed_addr { - %3 = alloca i64, align 4 - #dbg_declare(ptr %3, !14, !DIExpression(), !15) - ret void -} - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!2, !3, !4, !5} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "B.cpp", directory: ".") -!2 = !{i32 7, !"Dwarf Version", i32 4} -!3 = !{i32 2, !"Debug Info Version", i32 3} -!4 = !{i32 1, !"wchar_size", i32 4} -!5 = !{i32 8, !"PIC Level", i32 2} -!10 = distinct !DISubprogram(name: "thinlto1", linkageName: "_Z8thinlto1v", scope: !11, file: !11, line: 8, type: !12, scopeLine: 8, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) -!11 = !DIFile(filename: "b.cpp", directory: ".") -!12 = !DISubroutineType(types: !13) -!13 = !{null} -!14 = !DILocalVariable(name: "a", arg: 1, scope: !10, file: !11, line: 18, type: !16) -!15 = !DILocation(line: 18, column: 19, scope: !10) -!16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "S<&func1>", file: !11, line: 2, size: 8, flags: DIFlagTypePassByValue, elements: !17, templateParams: !18, 
identifier: "_ZTS1SIXadL_Z5func1vEEE") -!17 = !{} -!18 = !{!19} -!19 = !DITemplateValueParameter(name: "Func", type: !20, value: ptr undef) -!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64) diff --git a/llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll b/llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll deleted file mode 100644 index 0651705ccba8b8..00000000000000 --- a/llvm/test/ThinLTO/X86/ditemplatevalueparameter-remap.ll +++ /dev/null @@ -1,87 +0,0 @@ -; https://github.com/llvm/llvm-project/pull/110064 -; This test case checks if thinLTO correctly links metadata values in a specific -; situation. Assume we are linking module B into module A, where an extern -; function used in A is defined in B, but the function body has a -; DITemplateValueParameter referring to another function back in A. The -; compiler must check this other function is actually coming from A, thus -; already materialized and does not require remapping. The IR here is modified -; from the following source code. -; -; // A.h -; template -; struct S { -; void Impl() { -; Func(); -; } -; }; -; -; void func1(); -; -; // A.cpp -; #include "A.h" -; __attribute__((weak)) void func1() {} -; extern void thinlto1(); -; void bar() { -; S s; // Force instantiation of S in this compilation unit. 
-; s.Impl(); -; thinlto1(); -; } -; -; // B.cpp -; #include "A.h" -; void thinlto1() { -; S s; -; } -; -; RUN: opt -module-summary -o %t1.bc %s -; RUN: opt -module-summary -o %t2.bc %S/Inputs/ditemplatevalueparameter-remap.ll -; RUN: ld.lld --plugin-opt=thinlto-index-only -shared %t1.bc %t2.bc -; RUN: clang -O3 -fthinlto-index=%t1.bc.thinlto.bc -x ir %t1.bc -S -emit-llvm -o - | FileCheck %s - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -$_Z5func1v = comdat any - -define linkonce_odr dso_local void @_Z5func1v() unnamed_addr !dbg !10 { - ret void -} - -; Dummy function to use _Z5func1v so that it is not treated as dead symbol. -define void @_Z3bazv() { - tail call void @_Z5func1v() - ret void -} - -declare void @_Z8thinlto1v() unnamed_addr - -; CHECK: void @_Z3barv() -; CHECK-NOT: call void @_Z8thinlto1v() -; CHECK-NEXT: ret void -define void @_Z3barv() unnamed_addr !dbg !14 { - tail call void @_Z8thinlto1v(), !dbg !25 - ret void -} - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!2, !3, !4, !5} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "A.cpp", directory: ".") -!2 = !{i32 7, !"Dwarf Version", i32 4} -!3 = !{i32 2, !"Debug Info Version", i32 3} -!4 = !{i32 1, !"wchar_size", i32 4} -!5 = !{i32 8, !"PIC Level", i32 2} -!10 = distinct !DISubprogram(name: "func1", linkageName: "_Z5func1v", scope: !11, file: !11, line: 6, type: !12, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) -!11 = !DIFile(filename: "a.h", directory: ".") -!12 = !DISubroutineType(types: !13) -!13 = !{null} -!14 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !11, file: !11, line: 15, type: !12, scopeLine: 15, flags: 
DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !16) -!16 = !{!17} -!17 = !DILocalVariable(name: "s", scope: !14, file: !11, line: 10, type: !18) -!18 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "S<&func1>", file: !11, line: 2, size: 8, flags: DIFlagTypePassByValue, elements: !19, templateParams: !20, identifier: "_ZTS1SIXadL_Z5func1vEEE") -!19 = !{} -!20 = !{!21} -!21 = !DITemplateValueParameter(name: "Func", type: !22, value: ptr @_Z5func1v) -!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64) -!25 = !DILocation(line: 16, column: 5, scope: !14) From 45cc74357130190b9aef9fab77646c17f2cf2a5e Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 10 Oct 2024 16:22:13 -0700 Subject: [PATCH 095/177] [SandboxVec][DAG][NFC] Add comment about duplicate notes in deps() (#111915) --- .../Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index 7d300ea2b60d2d..5fa57efc1462e8 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -122,6 +122,10 @@ class DGNode { iterator preds_end(DependencyGraph &DAG) const { return const_cast(this)->preds_end(DAG); } + /// \Returns a range of DAG predecessors nodes. If this is a MemDGNode then + /// this will also include the memory dependency predecessors. + /// Please note that this can include the same node more than once, if for + /// example it's both a use-def predecessor and a mem dep predecessor. 
iterator_range preds(DependencyGraph &DAG) const { return make_range(preds_begin(DAG), preds_end(DAG)); } From cc20dd285ab72292a1d383d0779aecbe5e1ccf81 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 11 Oct 2024 10:19:49 +1100 Subject: [PATCH 096/177] [ORC][ELF] Remove the ExecutionSession& argument to ELFNixPlatform::Create. We can get a reference to the ExecutionSession from the ObjectLinkingLayer argument, so there's no need to pass it in separately. --- .../llvm/ExecutionEngine/Orc/ELFNixPlatform.h | 13 +++++----- .../ExecutionEngine/Orc/ELFNixPlatform.cpp | 24 ++++++++++--------- llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 4 ++-- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 4 ++-- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h index 40b85e32720108..54442c91096b39 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ELFNixPlatform.h @@ -106,14 +106,14 @@ class ELFNixPlatform : public Platform { /// RuntimeAliases function, in which case the client is responsible for /// setting up all aliases (including the required ones). static Expected> - Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, std::unique_ptr OrcRuntime, + Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, + std::unique_ptr OrcRuntime, std::optional RuntimeAliases = std::nullopt); /// Construct using a path to the ORC runtime. 
static Expected> - Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, const char *OrcRuntimePath, + Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, + const char *OrcRuntimePath, std::optional RuntimeAliases = std::nullopt); ExecutionSession &getExecutionSession() const { return ES; } @@ -211,8 +211,7 @@ class ELFNixPlatform : public Platform { static bool supportedTarget(const Triple &TT); - ELFNixPlatform(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, + ELFNixPlatform(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, std::unique_ptr OrcRuntimeGenerator, Error &Err); @@ -308,4 +307,4 @@ using SPSELFNixJITDylibDepInfoMap = } // end namespace orc } // end namespace llvm -#endif // LLVM_EXECUTIONENGINE_ORC_ELFNIXPLATFORM_H \ No newline at end of file +#endif // LLVM_EXECUTIONENGINE_ORC_ELFNIXPLATFORM_H diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index d92077dbcbd034..610ecbff5c5c4d 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -233,10 +233,13 @@ class DSOHandleMaterializationUnit : public MaterializationUnit { namespace llvm { namespace orc { -Expected> ELFNixPlatform::Create( - ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, std::unique_ptr OrcRuntime, - std::optional RuntimeAliases) { +Expected> +ELFNixPlatform::Create(ObjectLinkingLayer &ObjLinkingLayer, + JITDylib &PlatformJD, + std::unique_ptr OrcRuntime, + std::optional RuntimeAliases) { + + auto &ES = ObjLinkingLayer.getExecutionSession(); // If the target is not supported then bail out immediately. if (!supportedTarget(ES.getTargetTriple())) @@ -271,15 +274,14 @@ Expected> ELFNixPlatform::Create( // Create the instance. 
Error Err = Error::success(); auto P = std::unique_ptr(new ELFNixPlatform( - ES, ObjLinkingLayer, PlatformJD, std::move(OrcRuntime), Err)); + ObjLinkingLayer, PlatformJD, std::move(OrcRuntime), Err)); if (Err) return std::move(Err); return std::move(P); } Expected> -ELFNixPlatform::Create(ExecutionSession &ES, - ObjectLinkingLayer &ObjLinkingLayer, +ELFNixPlatform::Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, const char *OrcRuntimePath, std::optional RuntimeAliases) { @@ -289,7 +291,7 @@ ELFNixPlatform::Create(ExecutionSession &ES, if (!OrcRuntimeArchiveGenerator) return OrcRuntimeArchiveGenerator.takeError(); - return Create(ES, ObjLinkingLayer, PlatformJD, + return Create(ObjLinkingLayer, PlatformJD, std::move(*OrcRuntimeArchiveGenerator), std::move(RuntimeAliases)); } @@ -392,10 +394,10 @@ bool ELFNixPlatform::supportedTarget(const Triple &TT) { } ELFNixPlatform::ELFNixPlatform( - ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, + ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, std::unique_ptr OrcRuntimeGenerator, Error &Err) - : ES(ES), PlatformJD(PlatformJD), ObjLinkingLayer(ObjLinkingLayer), + : ES(ObjLinkingLayer.getExecutionSession()), PlatformJD(PlatformJD), + ObjLinkingLayer(ObjLinkingLayer), DSOHandleSymbol(ES.intern("__dso_handle")) { ErrorAsOutParameter _(&Err); ObjLinkingLayer.addPlugin(std::make_unique(*this)); diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index a13443ce57ea5c..d3dd3b6bedfb65 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -1185,8 +1185,8 @@ Expected ExecutorNativePlatform::operator()(LLJIT &J) { if (!G) return G.takeError(); - if (auto P = ELFNixPlatform::Create(ES, *ObjLinkingLayer, PlatformJD, - std::move(*G))) + if (auto P = + ELFNixPlatform::Create(*ObjLinkingLayer, PlatformJD, std::move(*G))) J.getExecutionSession().setPlatform(std::move(*P)); else return 
P.takeError(); diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index a2c05deefa6bfc..108cadd2e0169c 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -1041,8 +1041,8 @@ Session::Session(std::unique_ptr EPC, Error &Err) return; } } else if (TT.isOSBinFormatELF()) { - if (auto P = ELFNixPlatform::Create(ES, ObjLayer, *PlatformJD, - OrcRuntime.c_str())) + if (auto P = + ELFNixPlatform::Create(ObjLayer, *PlatformJD, OrcRuntime.c_str())) ES.setPlatform(std::move(*P)); else { Err = P.takeError(); From 4f320778148ba481881eb53ba065ed2a9d9bbc03 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 10 Oct 2024 16:22:07 -0700 Subject: [PATCH 097/177] Revert "[lldb] skip ReverseContinue tests on Darwin" This reverts commit c686eeb7fcc89673909e7e1f0a0a09a0da269d28. --- .../reverse-execution/TestReverseContinueBreakpoints.py | 6 ------ .../reverse-execution/TestReverseContinueNotSupported.py | 1 - 2 files changed, 7 deletions(-) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py index 8b53d86704f119..b37578fbd82468 100644 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py @@ -11,11 +11,9 @@ class TestReverseContinueBreakpoints(ReverseTestBase): NO_DEBUG_INFO_TESTCASE = True - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue(self): self.reverse_continue_internal(async_mode=False) - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_async(self): self.reverse_continue_internal(async_mode=True) @@ -39,11 +37,9 @@ def reverse_continue_internal(self, async_mode): self.assertState(process.GetState(), lldb.eStateExited) self.assertEqual(process.GetExitStatus(), 
0) - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_breakpoint(self): self.reverse_continue_breakpoint_internal(async_mode=False) - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_breakpoint_async(self): self.reverse_continue_breakpoint_internal(async_mode=True) @@ -58,11 +54,9 @@ def reverse_continue_breakpoint_internal(self, async_mode): threads_now = lldbutil.get_threads_stopped_at_breakpoint(process, trigger_bkpt) self.assertEqual(threads_now, initial_threads) - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_skip_breakpoint(self): self.reverse_continue_skip_breakpoint_internal(async_mode=False) - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_skip_breakpoint_async(self): self.reverse_continue_skip_breakpoint_internal(async_mode=True) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py index 8a20f0ffdcf660..d610761b8cb0bc 100644 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py +++ b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py @@ -8,7 +8,6 @@ class TestReverseContinueNotSupported(TestBase): NO_DEBUG_INFO_TESTCASE = True - @skipIfDarwin # No Darwin ProcessNative impl for lldb-server def test_reverse_continue_not_supported(self): self.build() exe = self.getBuildArtifact("a.out") From a28e7ce378d717e6aacbdc3089974b93b6b62948 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 10 Oct 2024 16:22:18 -0700 Subject: [PATCH 098/177] Revert "[lldb] SetErrorStringWithFormatv -> FromErrorStringWithFormatv (NFC)" This reverts commit fae7d6848bbb59fc2bad17adbdb34bd6a11a0651. 
--- lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 367fce442bb866..116c43343c01d1 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -407,9 +407,8 @@ Status ProcessKDP::DoResume(RunDirection direction) { Log *log = GetLog(KDPLog::Process); if (direction == RunDirection::eRunReverse) { - error.FromErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", - GetPluginName()); + error.SetErrorStringWithFormatv( + "error: {0} does not support reverse execution of processes", GetPluginName()); return error; } From 3bef742559f1556569423ec63c70b97dff1d426e Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 10 Oct 2024 16:22:24 -0700 Subject: [PATCH 099/177] Revert "[lldb] Implement basic support for reverse-continue (#99736)" Reverting this again; I added a commit which added @skipIfDarwin markers to the TestReverseContinueBreakpoints.py and TestReverseContinueNotSupported.py API tests, which use lldb-server in gdbserver mode which does not work on Darwin. 
But the aarch64 ubuntu bot reported a failure on TestReverseContinueBreakpoints.py, https://lab.llvm.org/buildbot/#/builders/59/builds/6397 File "/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py", line 63, in test_reverse_continue_skip_breakpoint self.reverse_continue_skip_breakpoint_internal(async_mode=False) File "/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py", line 81, in reverse_continue_skip_breakpoint_internal self.expect( File "/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/packages/Python/lldbsuite/test/lldbtest.py", line 2372, in expect self.runCmd( File "/home/tcwg-buildbot/worker/lldb-aarch64-ubuntu/llvm-project/lldb/packages/Python/lldbsuite/test/lldbtest.py", line 1002, in runCmd self.assertTrue(self.res.Succeeded(), msg + output) AssertionError: False is not true : Process should be stopped due to history boundary Error output: error: Process must be launched. This reverts commit 4f297566b3150097de26c6a23a987d2bd5fc19c5. 
--- lldb/include/lldb/API/SBProcess.h | 1 - lldb/include/lldb/Target/Process.h | 21 +- lldb/include/lldb/Target/StopInfo.h | 6 - lldb/include/lldb/lldb-enumerations.h | 6 - .../Python/lldbsuite/test/gdbclientutils.py | 5 +- .../Python/lldbsuite/test/lldbgdbproxy.py | 175 -------- .../Python/lldbsuite/test/lldbreverse.py | 418 ------------------ .../Python/lldbsuite/test/lldbtest.py | 2 - lldb/source/API/SBProcess.cpp | 8 +- lldb/source/API/SBThread.cpp | 2 - .../source/Interpreter/CommandInterpreter.cpp | 3 +- .../Process/Linux/NativeThreadLinux.cpp | 3 - .../Process/MacOSX-Kernel/ProcessKDP.cpp | 9 +- .../Process/MacOSX-Kernel/ProcessKDP.h | 2 +- .../Process/Windows/Common/ProcessWindows.cpp | 8 +- .../Process/Windows/Common/ProcessWindows.h | 2 +- .../GDBRemoteCommunicationClient.cpp | 22 - .../gdb-remote/GDBRemoteCommunicationClient.h | 6 - .../GDBRemoteCommunicationServerLLGS.cpp | 1 - .../Process/gdb-remote/ProcessGDBRemote.cpp | 77 +--- .../Process/gdb-remote/ProcessGDBRemote.h | 2 +- .../Process/scripted/ScriptedProcess.cpp | 9 +- .../Process/scripted/ScriptedProcess.h | 2 +- lldb/source/Target/Process.cpp | 29 +- lldb/source/Target/StopInfo.cpp | 29 -- lldb/source/Target/Thread.cpp | 8 +- .../reverse-execution/Makefile | 3 - .../TestReverseContinueBreakpoints.py | 115 ----- .../TestReverseContinueNotSupported.py | 30 -- .../functionalities/reverse-execution/main.c | 14 - lldb/tools/lldb-dap/JSONUtils.cpp | 3 - lldb/tools/lldb-dap/LLDBUtils.cpp | 1 - 32 files changed, 44 insertions(+), 978 deletions(-) delete mode 100644 lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py delete mode 100644 lldb/packages/Python/lldbsuite/test/lldbreverse.py delete mode 100644 lldb/test/API/functionalities/reverse-execution/Makefile delete mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py delete mode 100644 lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py delete mode 100644 
lldb/test/API/functionalities/reverse-execution/main.c diff --git a/lldb/include/lldb/API/SBProcess.h b/lldb/include/lldb/API/SBProcess.h index 8b8ed830b54cc0..1624e02070b1b2 100644 --- a/lldb/include/lldb/API/SBProcess.h +++ b/lldb/include/lldb/API/SBProcess.h @@ -159,7 +159,6 @@ class LLDB_API SBProcess { lldb::SBError Destroy(); lldb::SBError Continue(); - lldb::SBError Continue(RunDirection direction); lldb::SBError Stop(); diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index fe7fbc50fd5770..b8c53a474ba6b9 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -857,10 +857,10 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - Status Resume(lldb::RunDirection direction = lldb::eRunForward); + Status Resume(); /// Resume a process, and wait for it to stop. - Status ResumeSynchronous(Stream *stream, lldb::RunDirection direction = lldb::eRunForward); + Status ResumeSynchronous(Stream *stream); /// Halts a running process. /// @@ -1104,14 +1104,9 @@ class Process : public std::enable_shared_from_this, /// \see Thread:Resume() /// \see Thread:Step() /// \see Thread:Suspend() - virtual Status DoResume(lldb::RunDirection direction) { - if (direction == lldb::RunDirection::eRunForward) { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support resuming processes", GetPluginName()); - } else { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - } + virtual Status DoResume() { + return Status::FromErrorStringWithFormatv( + "error: {0} does not support resuming processes", GetPluginName()); } /// Called after resuming a process. 
@@ -2337,8 +2332,6 @@ class Process : public std::enable_shared_from_this, bool IsRunning() const; - lldb::RunDirection GetLastRunDirection() { return m_last_run_direction; } - DynamicCheckerFunctions *GetDynamicCheckers() { return m_dynamic_checkers_up.get(); } @@ -2858,7 +2851,7 @@ void PruneThreadPlans(); /// /// \return /// An Status object describing the success or failure of the resume. - Status PrivateResume(lldb::RunDirection direction = lldb::eRunForward); + Status PrivateResume(); // Called internally void CompleteAttach(); @@ -3134,8 +3127,6 @@ void PruneThreadPlans(); // m_currently_handling_do_on_removals are true, // Resume will only request a resume, using this // flag to check. - // The direction of execution from the last time this process was resumed. - lldb::RunDirection m_last_run_direction; lldb::tid_t m_interrupt_tid; /// The tid of the thread that issued the async /// interrupt, used by thread plan timeout. It diff --git a/lldb/include/lldb/Target/StopInfo.h b/lldb/include/lldb/Target/StopInfo.h index 072f71f6b1122f..fae90364deaf0a 100644 --- a/lldb/include/lldb/Target/StopInfo.h +++ b/lldb/include/lldb/Target/StopInfo.h @@ -142,12 +142,6 @@ class StopInfo : public std::enable_shared_from_this { static lldb::StopInfoSP CreateStopReasonProcessorTrace(Thread &thread, const char *description); - // This creates a StopInfo indicating that execution stopped because - // it was replaying some recorded execution history, and execution reached - // the end of that recorded history. 
- static lldb::StopInfoSP - CreateStopReasonHistoryBoundary(Thread &thread, const char *description); - static lldb::StopInfoSP CreateStopReasonFork(Thread &thread, lldb::pid_t child_pid, lldb::tid_t child_tid); diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 232d1dfdb5c9d0..938f6e3abe8f2a 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -135,9 +135,6 @@ FLAGS_ENUM(LaunchFlags){ /// Thread Run Modes. enum RunMode { eOnlyThisThread, eAllThreads, eOnlyDuringStepping }; -/// Execution directions -enum RunDirection { eRunForward, eRunReverse }; - /// Byte ordering definitions. enum ByteOrder { eByteOrderInvalid = 0, @@ -257,9 +254,6 @@ enum StopReason { eStopReasonVFork, eStopReasonVForkDone, eStopReasonInterrupt, ///< Thread requested interrupt - // Indicates that execution stopped because the debugger backend relies - // on recorded data and we reached the end of that data. - eStopReasonHistoryBoundary, }; /// Command Return Status Types. 
diff --git a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py index 732d6171320680..1784487323ad6b 100644 --- a/lldb/packages/Python/lldbsuite/test/gdbclientutils.py +++ b/lldb/packages/Python/lldbsuite/test/gdbclientutils.py @@ -510,9 +510,8 @@ def start(self): self._thread.start() def stop(self): - if self._thread is not None: - self._thread.join() - self._thread = None + self._thread.join() + self._thread = None def get_connect_address(self): return self._socket.get_connect_address() diff --git a/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py b/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py deleted file mode 100644 index 2a9592bf4545a4..00000000000000 --- a/lldb/packages/Python/lldbsuite/test/lldbgdbproxy.py +++ /dev/null @@ -1,175 +0,0 @@ -import logging -import os -import os.path -import random - -import lldb -from lldbsuite.test.lldbtest import * -from lldbsuite.test.gdbclientutils import * -import lldbgdbserverutils -from lldbsuite.support import seven - - -class GDBProxyTestBase(TestBase): - """ - Base class for gdbserver proxy tests. - - This class will setup and start a mock GDB server for the test to use. - It pases through requests to a regular lldb-server/debugserver and - forwards replies back to the LLDB under test. 
- """ - - """The gdbserver that we implement.""" - server = None - """The inner lldb-server/debugserver process that we proxy requests into.""" - monitor_server = None - monitor_sock = None - - server_socket_class = TCPServerSocket - - DEFAULT_TIMEOUT = 20 * (10 if ("ASAN_OPTIONS" in os.environ) else 1) - - _verbose_log_handler = None - _log_formatter = logging.Formatter(fmt="%(asctime)-15s %(levelname)-8s %(message)s") - - def setUpBaseLogging(self): - self.logger = logging.getLogger(__name__) - - if len(self.logger.handlers) > 0: - return # We have set up this handler already - - self.logger.propagate = False - self.logger.setLevel(logging.DEBUG) - - # log all warnings to stderr - handler = logging.StreamHandler() - handler.setLevel(logging.WARNING) - handler.setFormatter(self._log_formatter) - self.logger.addHandler(handler) - - def setUp(self): - TestBase.setUp(self) - - self.setUpBaseLogging() - - if self.isVerboseLoggingRequested(): - # If requested, full logs go to a log file - log_file_name = self.getLogBasenameForCurrentTest() + "-proxy.log" - self._verbose_log_handler = logging.FileHandler( - log_file_name - ) - self._verbose_log_handler.setFormatter(self._log_formatter) - self._verbose_log_handler.setLevel(logging.DEBUG) - self.logger.addHandler(self._verbose_log_handler) - - lldb_server_exe = lldbgdbserverutils.get_lldb_server_exe() - if lldb_server_exe is None: - self.debug_monitor_exe = lldbgdbserverutils.get_debugserver_exe() - self.assertTrue(self.debug_monitor_exe is not None) - self.debug_monitor_extra_args = [] - else: - self.debug_monitor_exe = lldb_server_exe - self.debug_monitor_extra_args = ["gdbserver"] - - self.server = MockGDBServer(self.server_socket_class()) - self.server.responder = self - - def tearDown(self): - # TestBase.tearDown will kill the process, but we need to kill it early - # so its client connection closes and we can stop the server before - # finally calling the base tearDown. 
- if self.process() is not None: - self.process().Kill() - self.server.stop() - - self.logger.removeHandler(self._verbose_log_handler) - self._verbose_log_handler = None - - TestBase.tearDown(self) - - def isVerboseLoggingRequested(self): - # We will report our detailed logs if the user requested that the "gdb-remote" channel is - # logged. - return any(("gdb-remote" in channel) for channel in lldbtest_config.channels) - - def connect(self, target): - """ - Create a process by connecting to the mock GDB server. - """ - self.prep_debug_monitor_and_inferior() - self.server.start() - - listener = self.dbg.GetListener() - error = lldb.SBError() - process = target.ConnectRemote( - listener, self.server.get_connect_url(), "gdb-remote", error - ) - self.assertTrue(error.Success(), error.description) - self.assertTrue(process, PROCESS_IS_VALID) - return process - - def get_next_port(self): - return 12000 + random.randint(0, 3999) - - def prep_debug_monitor_and_inferior(self): - inferior_exe_path = self.getBuildArtifact("a.out") - self.connect_to_debug_monitor([inferior_exe_path]) - self.assertIsNotNone(self.monitor_server) - self.initial_handshake() - - def initial_handshake(self): - self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "+") - self.monitor_server.send_packet(seven.bitcast_to_bytes("QStartNoAckMode")) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "+") - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "OK") - self.monitor_server.send_packet(seven.bitcast_to_bytes("+")) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.assertEqual(reply, "+") - - def get_debug_monitor_command_line_args(self, connect_address, launch_args): - return self.debug_monitor_extra_args + ["--reverse-connect", connect_address] + launch_args - - 
def launch_debug_monitor(self, launch_args): - family, type, proto, _, addr = socket.getaddrinfo( - "localhost", 0, proto=socket.IPPROTO_TCP - )[0] - sock = socket.socket(family, type, proto) - sock.settimeout(self.DEFAULT_TIMEOUT) - sock.bind(addr) - sock.listen(1) - addr = sock.getsockname() - connect_address = "[{}]:{}".format(*addr) - - commandline_args = self.get_debug_monitor_command_line_args( - connect_address, launch_args - ) - - # Start the server. - self.logger.info(f"Spawning monitor {commandline_args}") - monitor_process = self.spawnSubprocess( - self.debug_monitor_exe, commandline_args, install_remote=False - ) - self.assertIsNotNone(monitor_process) - - self.monitor_sock = sock.accept()[0] - self.monitor_sock.settimeout(self.DEFAULT_TIMEOUT) - return monitor_process - - def connect_to_debug_monitor(self, launch_args): - monitor_process = self.launch_debug_monitor(launch_args) - self.monitor_server = lldbgdbserverutils.Server(self.monitor_sock, monitor_process) - - def respond(self, packet): - """Subclasses can override this to change how packets are handled.""" - return self.pass_through(packet) - - def pass_through(self, packet): - self.logger.info(f"Sending packet {packet}") - self.monitor_server.send_packet(seven.bitcast_to_bytes(packet)) - reply = seven.bitcast_to_string(self.monitor_server.get_normal_packet()) - self.logger.info(f"Received reply {reply}") - return reply diff --git a/lldb/packages/Python/lldbsuite/test/lldbreverse.py b/lldb/packages/Python/lldbsuite/test/lldbreverse.py deleted file mode 100644 index 0f02fdffbdeada..00000000000000 --- a/lldb/packages/Python/lldbsuite/test/lldbreverse.py +++ /dev/null @@ -1,418 +0,0 @@ -import os -import os.path -import lldb -from lldbsuite.test.lldbtest import * -from lldbsuite.test.gdbclientutils import * -from lldbsuite.test.lldbgdbproxy import * -import lldbgdbserverutils -import re - - -class ThreadSnapshot: - def __init__(self, thread_id, registers): - self.thread_id = thread_id - 
self.registers = registers - - -class MemoryBlockSnapshot: - def __init__(self, address, data): - self.address = address - self.data = data - - -class StateSnapshot: - def __init__(self, thread_snapshots, memory): - self.thread_snapshots = thread_snapshots - self.memory = memory - self.thread_id = None - - -class RegisterInfo: - def __init__(self, lldb_index, bitsize, little_endian): - self.lldb_index = lldb_index - self.bitsize = bitsize - self.little_endian = little_endian - - -BELOW_STACK_POINTER = 16384 -ABOVE_STACK_POINTER = 4096 - -BLOCK_SIZE = 1024 - -SOFTWARE_BREAKPOINTS = 0 -HARDWARE_BREAKPOINTS = 1 -WRITE_WATCHPOINTS = 2 - - -class ReverseTestBase(GDBProxyTestBase): - """ - Base class for tests that need reverse execution. - - This class uses a gdbserver proxy to add very limited reverse- - execution capability to lldb-server/debugserver for testing - purposes only. - - To use this class, run the inferior forward until some stopping point. - Then call `start_recording()` and execute forward again until reaching - a software breakpoint; this class records the state before each execution executes. - At that point, the server will accept "bc" and "bs" packets to step - backwards through the state. - When executing during recording, we only allow single-step and continue without - delivering a signal, and only software breakpoint stops are allowed. - - We assume that while recording is enabled, the only effects of instructions - are on general-purpose registers (read/written by the 'g' and 'G' packets) - and on memory bytes between [SP - BELOW_STACK_POINTER, SP + ABOVE_STACK_POINTER). - """ - - """ - A list of StateSnapshots in time order. - - There is one snapshot per single-stepped instruction, - representing the state before that instruction was - executed. The last snapshot in the list is the - snapshot before the last instruction was executed. 
- This is an undo log; we snapshot a superset of the state that may have - been changed by the instruction's execution. - """ - snapshots = None - recording_enabled = False - - breakpoints = None - - pid = None - - pc_register_info = None - sp_register_info = None - general_purpose_register_info = None - - def __init__(self, *args, **kwargs): - GDBProxyTestBase.__init__(self, *args, **kwargs) - self.breakpoints = [set(), set(), set(), set(), set()] - - def respond(self, packet): - if not packet: - raise ValueError("Invalid empty packet") - if packet == self.server.PACKET_INTERRUPT: - # Don't send a response. We'll just run to completion. - return [] - if self.is_command(packet, "qSupported", ":"): - reply = self.pass_through(packet) - return reply + ";ReverseStep+;ReverseContinue+" - if self.is_command(packet, "vCont", ";"): - if self.recording_enabled: - return self.continue_with_recording(packet) - snapshots = [] - if packet[0] == "c" or packet[0] == "s" or packet[0] == "C" or packet[0] == "S": - raise ValueError("LLDB should not be sending old-style continuation packets") - if packet == "bc": - return self.reverse_continue() - if packet == "bs": - return self.reverse_step() - if packet == 'jThreadsInfo': - # Suppress this because it contains thread stop reasons which we might - # need to modify, and we don't want to have to implement that. - return "" - if packet[0] == "z" or packet[0] == "Z": - reply = self.pass_through(packet) - if reply == "OK": - self.update_breakpoints(packet) - return reply - return GDBProxyTestBase.respond(self, packet) - - def start_recording(self): - self.recording_enabled = True - self.snapshots = [] - - def stop_recording(self): - """ - Don't record when executing foward. - - Reverse execution is still supported until the next forward continue. 
- """ - self.recording_enabled = False - - def is_command(self, packet, cmd, follow_token): - return packet == cmd or packet[0:len(cmd) + 1] == cmd + follow_token - - def update_breakpoints(self, packet): - m = re.match("([zZ])([01234]),([0-9a-f]+),([0-9a-f]+)", packet) - if m is None: - raise ValueError("Invalid breakpoint packet: " + packet) - t = int(m.group(2)) - addr = int(m.group(3), 16) - kind = int(m.group(4), 16) - if m.group(1) == 'Z': - self.breakpoints[t].add((addr, kind)) - else: - self.breakpoints[t].discard((addr, kind)) - - def breakpoint_triggered_at(self, pc): - if any(addr == pc for addr, kind in self.breakpoints[SOFTWARE_BREAKPOINTS]): - return True - if any(addr == pc for addr, kind in self.breakpoints[HARDWARE_BREAKPOINTS]): - return True - return False - - def watchpoint_triggered(self, new_value_block, current_contents): - """Returns the address or None.""" - for watch_addr, kind in breakpoints[WRITE_WATCHPOINTS]: - for offset in range(0, kind): - addr = watch_addr + offset - if (addr >= new_value_block.address and - addr < new_value_block.address + len(new_value_block.data)): - index = addr - new_value_block.address - if new_value_block.data[index*2:(index + 1)*2] != current_contents[index*2:(index + 1)*2]: - return watch_addr - return None - - def continue_with_recording(self, packet): - self.logger.debug("Continue with recording enabled") - - step_packet = "vCont;s" - if packet == "vCont": - requested_step = False - else: - m = re.match("vCont;(c|s)(.*)", packet) - if m is None: - raise ValueError("Unsupported vCont packet: " + packet) - requested_step = m.group(1) == 's' - step_packet += m.group(2) - - while True: - snapshot = self.capture_snapshot() - reply = self.pass_through(step_packet) - (stop_signal, stop_pairs) = self.parse_stop(reply) - if stop_signal != 5: - raise ValueError("Unexpected stop signal: " + reply) - is_swbreak = False - thread_id = None - for key, value in stop_pairs.items(): - if key == "thread": - thread_id = 
self.parse_thread_id(value) - continue - if re.match('[0-9a-f]+', key): - continue - if key == "swbreak" or (key == "reason" and value == "breakpoint"): - is_swbreak = True - continue - if key in ["name", "threads", "thread-pcs", "reason"]: - continue - raise ValueError(f"Unknown stop key '{key}' in {reply}") - if is_swbreak: - self.logger.debug("Recording stopped") - return reply - if thread_id is None: - return ValueError("Expected thread ID: " + reply) - snapshot.thread_id = thread_id - self.snapshots.append(snapshot) - if requested_step: - self.logger.debug("Recording stopped for step") - return reply - - def parse_stop(self, reply): - result = {} - if not reply: - raise ValueError("Invalid empty packet") - if reply[0] == "T" and len(reply) >= 3: - result = {k:v for k, v in self.parse_pairs(reply[3:])} - return (int(reply[1:3], 16), result) - raise "Unsupported stop reply: " + reply - - def parse_pairs(self, text): - for pair in text.split(";"): - if not pair: - continue - m = re.match("([^:]+):(.*)", pair) - if m is None: - raise ValueError("Invalid pair text: " + text) - yield (m.group(1), m.group(2)) - - def capture_snapshot(self): - """Snapshot all threads and their stack memories.""" - self.ensure_register_info() - current_thread = self.get_current_thread() - thread_snapshots = [] - memory = [] - for thread_id in self.get_thread_list(): - registers = {} - for index in sorted(self.general_purpose_register_info.keys()): - reply = self.pass_through(f"p{index:x};thread:{thread_id:x};") - if reply == "" or reply[0] == 'E': - raise ValueError("Can't read register") - registers[index] = reply - thread_snapshot = ThreadSnapshot(thread_id, registers) - thread_sp = self.get_register(self.sp_register_info, thread_snapshot.registers) - memory += self.read_memory(thread_sp - BELOW_STACK_POINTER, thread_sp + ABOVE_STACK_POINTER) - thread_snapshots.append(thread_snapshot) - self.set_current_thread(current_thread) - return StateSnapshot(thread_snapshots, memory) - - def 
restore_snapshot(self, snapshot): - """ - Restore the snapshot during reverse execution. - - If this triggers a breakpoint or watchpoint, return the stop reply, - otherwise None. - """ - current_thread = self.get_current_thread() - stop_reasons = [] - for thread_snapshot in snapshot.thread_snapshots: - thread_id = thread_snapshot.thread_id - for lldb_index in sorted(thread_snapshot.registers.keys()): - data = thread_snapshot.registers[lldb_index] - reply = self.pass_through(f"P{lldb_index:x}={data};thread:{thread_id:x};") - if reply != "OK": - raise ValueError("Can't restore thread register") - if thread_id == snapshot.thread_id: - new_pc = self.get_register(self.pc_register_info, thread_snapshot.registers) - if self.breakpoint_triggered_at(new_pc): - stop_reasons.append([("reason", "breakpoint")]) - self.set_current_thread(current_thread) - for block in snapshot.memory: - current_memory = self.pass_through(f"m{block.address:x},{(len(block.data)/2):x}") - if not current_memory or current_memory[0] == 'E': - raise ValueError("Can't read back memory") - reply = self.pass_through(f"M{block.address:x},{len(block.data)/2:x}:" + block.data) - if reply != "OK": - raise ValueError("Can't restore memory") - watch_addr = self.watchpoint_triggered(block, current_memory[1:]) - if watch_addr is not None: - stop_reasons.append([("reason", "watchpoint"), ("watch", f"{watch_addr:x}")]) - if stop_reasons: - pairs = ";".join(f"{key}:{value}" for key, value in stop_reasons[0]) - return f"T05thread:{self.pid:x}.{snapshot.thread_id:x};{pairs};" - return None - - def reverse_step(self): - if not self.snapshots: - self.logger.debug("Reverse-step at history boundary") - return self.history_boundary_reply(self.get_current_thread()) - self.logger.debug("Reverse-step started") - snapshot = self.snapshots.pop() - stop_reply = self.restore_snapshot(snapshot) - self.set_current_thread(snapshot.thread_id) - self.logger.debug("Reverse-step stopped") - if stop_reply is None: - return 
self.singlestep_stop_reply(snapshot.thread_id) - return stop_reply - - def reverse_continue(self): - self.logger.debug("Reverse-continue started") - thread_id = None - while self.snapshots: - snapshot = self.snapshots.pop() - stop_reply = self.restore_snapshot(snapshot) - thread_id = snapshot.thread_id - if stop_reply is not None: - self.set_current_thread(thread_id) - self.logger.debug("Reverse-continue stopped") - return stop_reply - if thread_id is None: - thread_id = self.get_current_thread() - else: - self.set_current_thread(snapshot.thread_id) - self.logger.debug("Reverse-continue stopped at history boundary") - return self.history_boundary_reply(thread_id) - - def get_current_thread(self): - reply = self.pass_through("qC") - return self.parse_thread_id(reply[2:]) - - def parse_thread_id(self, thread_id): - m = re.match("(p([0-9a-f]+)[.])?([0-9a-f]+)$", thread_id) - if m is None: - raise ValueError("Invalid thread ID: " + thread_id) - if self.pid is None: - self.pid = int(m.group(2), 16) - return int(m.group(3), 16) - - def history_boundary_reply(self, thread_id): - return f"T00thread:{self.pid:x}.{thread_id:x};replaylog:begin;" - - def singlestep_stop_reply(self, thread_id): - return f"T05thread:{self.pid:x}.{thread_id:x};" - - def set_current_thread(self, thread_id): - """ - Set current thread in inner gdbserver. 
- """ - if thread_id >= 0: - self.pass_through(f"Hg{self.pid:x}.{thread_id:x}") - self.pass_through(f"Hc{self.pid:x}.{thread_id:x}") - else: - self.pass_through(f"Hc-1.-1") - self.pass_through(f"Hg-1.-1") - - def get_register(self, register_info, registers): - if register_info.bitsize % 8 != 0: - raise ValueError("Register size must be a multiple of 8 bits") - if register_info.lldb_index not in registers: - raise ValueError("Register value not captured") - data = registers[register_info.lldb_index] - num_bytes = register_info.bitsize//8 - bytes = [] - for i in range(0, num_bytes): - bytes.append(int(data[i*2:(i + 1)*2], 16)) - if register_info.little_endian: - bytes.reverse() - result = 0 - for byte in bytes: - result = (result << 8) + byte - return result - - def read_memory(self, start_addr, end_addr): - """ - Read a region of memory from the target. - - Some of the addresses may extend into invalid virtual memory; - skip those areas. - Return a list of blocks containing the valid area(s) in the - requested range. 
- """ - regions = [] - start_addr = start_addr & (BLOCK_SIZE - 1) - end_addr = (end_addr + BLOCK_SIZE - 1) & (BLOCK_SIZE - 1) - for addr in range(start_addr, end_addr, BLOCK_SIZE): - reply = self.pass_through(f"m{addr:x},{(BLOCK_SIZE - 1):x}") - if reply and reply[0] != 'E': - block = MemoryBlockSnapshot(addr, reply[1:]) - regions.append(block) - return regions - - def ensure_register_info(self): - if self.general_purpose_register_info is not None: - return - reply = self.pass_through("qHostInfo") - little_endian = any(kv == ("endian", "little") for kv in self.parse_pairs(reply)) - self.general_purpose_register_info = {} - lldb_index = 0 - while True: - reply = self.pass_through(f"qRegisterInfo{lldb_index:x}") - if not reply or reply[0] == 'E': - break - info = {k:v for k, v in self.parse_pairs(reply)} - reg_info = RegisterInfo(lldb_index, int(info["bitsize"]), little_endian) - if info["set"] == "General Purpose Registers" and not "container-regs" in info: - self.general_purpose_register_info[lldb_index] = reg_info - if "generic" in info: - if info["generic"] == "pc": - self.pc_register_info = reg_info - elif info["generic"] == "sp": - self.sp_register_info = reg_info - lldb_index += 1 - if self.pc_register_info is None or self.sp_register_info is None: - raise ValueError("Can't find generic pc or sp register") - - def get_thread_list(self): - threads = [] - reply = self.pass_through("qfThreadInfo") - while True: - if not reply: - raise ValueError("Missing reply packet") - if reply[0] == 'm': - for id in reply[1:].split(","): - threads.append(self.parse_thread_id(id)) - elif reply[0] == 'l': - return threads - reply = self.pass_through("qsThreadInfo") diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index 7cc1ac9749ec93..8884ef5933ada8 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -143,8 +143,6 @@ STOPPED_DUE_TO_WATCHPOINT = "Process 
should be stopped due to watchpoint" -STOPPED_DUE_TO_HISTORY_BOUNDARY = "Process should be stopped due to history boundary" - DATA_TYPES_DISPLAYED_CORRECTLY = "Data type(s) displayed correctly" VALID_BREAKPOINT = "Got a valid breakpoint" diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index 07780f9f9c8393..9773144723c34c 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -564,10 +564,6 @@ uint32_t SBProcess::GetAddressByteSize() const { } SBError SBProcess::Continue() { - return Continue(RunDirection::eRunForward); -} - -SBError SBProcess::Continue(RunDirection direction) { LLDB_INSTRUMENT_VA(this); SBError sb_error; @@ -578,9 +574,9 @@ SBError SBProcess::Continue(RunDirection direction) { process_sp->GetTarget().GetAPIMutex()); if (process_sp->GetTarget().GetDebugger().GetAsyncExecution()) - sb_error.ref() = process_sp->Resume(direction); + sb_error.ref() = process_sp->Resume(); else - sb_error.ref() = process_sp->ResumeSynchronous(nullptr, direction); + sb_error.ref() = process_sp->ResumeSynchronous(nullptr); } else sb_error = Status::FromErrorString("SBProcess is invalid"); diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index aca8a039952960..a99456e06d0329 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -172,7 +172,6 @@ size_t SBThread::GetStopReasonDataCount() { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: - case eStopReasonHistoryBoundary: // There is no data for these stop reasons. return 0; @@ -234,7 +233,6 @@ uint64_t SBThread::GetStopReasonDataAtIndex(uint32_t idx) { case eStopReasonInstrumentation: case eStopReasonProcessorTrace: case eStopReasonVForkDone: - case eStopReasonHistoryBoundary: // There is no data for these stop reasons. 
return 0; diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index ea60492ac46a10..8d3a82ef6c990a 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2553,8 +2553,7 @@ bool CommandInterpreter::DidProcessStopAbnormally() const { const StopReason reason = stop_info->GetStopReason(); if (reason == eStopReasonException || reason == eStopReasonInstrumentation || - reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt || - reason == eStopReasonHistoryBoundary) + reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt) return true; if (reason == eStopReasonSignal) { diff --git a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp index b0aa664775b463..de047ee214c11e 100644 --- a/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeThreadLinux.cpp @@ -82,9 +82,6 @@ void LogThreadStopInfo(Log &log, const ThreadStopInfo &stop_info, case eStopReasonProcessorTrace: log.Printf("%s: %s processor trace", __FUNCTION__, header); return; - case eStopReasonHistoryBoundary: - log.Printf("%s: %s history boundary", __FUNCTION__, header); - return; default: log.Printf("%s: %s invalid stop reason %" PRIu32, __FUNCTION__, header, static_cast(stop_info.reason)); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp index 116c43343c01d1..9b2907c6809965 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp @@ -402,16 +402,9 @@ lldb_private::DynamicLoader *ProcessKDP::GetDynamicLoader() { Status ProcessKDP::WillResume() { return Status(); } -Status ProcessKDP::DoResume(RunDirection direction) { +Status ProcessKDP::DoResume() { Status error; Log *log = GetLog(KDPLog::Process); - - if 
(direction == RunDirection::eRunReverse) { - error.SetErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - return error; - } - // Only start the async thread if we try to do any process control if (!m_async_thread.IsJoinable()) StartAsyncThread(); diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h index 1b71d83f70b087..e5ec5914f9600d 100644 --- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h +++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.h @@ -90,7 +90,7 @@ class ProcessKDP : public lldb_private::Process { // Process Control lldb_private::Status WillResume() override; - lldb_private::Status DoResume(lldb::RunDirection direction) override; + lldb_private::Status DoResume() override; lldb_private::Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp index 76b7095deaa503..703aa082f0476f 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.cpp @@ -204,17 +204,11 @@ ProcessWindows::DoAttachToProcessWithID(lldb::pid_t pid, return error; } -Status ProcessWindows::DoResume(RunDirection direction) { +Status ProcessWindows::DoResume() { Log *log = GetLog(WindowsLog::Process); llvm::sys::ScopedLock lock(m_mutex); Status error; - if (direction == RunDirection::eRunReverse) { - error.SetErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - return error; - } - StateType private_state = GetPrivateState(); if (private_state == eStateStopped || private_state == eStateCrashed) { LLDB_LOG(log, "process {0} is in state {1}. 
Resuming...", diff --git a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h index 97284b7cd1436e..e97cfb790248be 100644 --- a/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h +++ b/lldb/source/Plugins/Process/Windows/Common/ProcessWindows.h @@ -52,7 +52,7 @@ class ProcessWindows : public Process, public ProcessDebugger { Status DoAttachToProcessWithID( lldb::pid_t pid, const lldb_private::ProcessAttachInfo &attach_info) override; - Status DoResume(lldb::RunDirection direction) override; + Status DoResume() override; Status DoDestroy() override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index fc792a4409410b..e42526c8fd7266 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -199,20 +199,6 @@ uint64_t GDBRemoteCommunicationClient::GetRemoteMaxPacketSize() { return m_max_packet_size; } -bool GDBRemoteCommunicationClient::GetReverseContinueSupported() { - if (m_supports_reverse_continue == eLazyBoolCalculate) { - GetRemoteQSupported(); - } - return m_supports_reverse_continue == eLazyBoolYes; -} - -bool GDBRemoteCommunicationClient::GetReverseStepSupported() { - if (m_supports_reverse_step == eLazyBoolCalculate) { - GetRemoteQSupported(); - } - return m_supports_reverse_step == eLazyBoolYes; -} - bool GDBRemoteCommunicationClient::QueryNoAckModeSupported() { if (m_supports_not_sending_acks == eLazyBoolCalculate) { m_send_acks = true; @@ -309,8 +295,6 @@ void GDBRemoteCommunicationClient::ResetDiscoverableSettings(bool did_exec) { m_supports_qXfer_siginfo_read = eLazyBoolCalculate; m_supports_augmented_libraries_svr4_read = eLazyBoolCalculate; m_uses_native_signals = eLazyBoolCalculate; - m_supports_reverse_continue = 
eLazyBoolCalculate; - m_supports_reverse_step = eLazyBoolCalculate; m_supports_qProcessInfoPID = true; m_supports_qfProcessInfo = true; m_supports_qUserName = true; @@ -364,8 +348,6 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_memory_tagging = eLazyBoolNo; m_supports_qSaveCore = eLazyBoolNo; m_uses_native_signals = eLazyBoolNo; - m_supports_reverse_continue = eLazyBoolNo; - m_supports_reverse_step = eLazyBoolNo; m_max_packet_size = UINT64_MAX; // It's supposed to always be there, but if // not, we assume no limit @@ -419,10 +401,6 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() { m_supports_qSaveCore = eLazyBoolYes; else if (x == "native-signals+") m_uses_native_signals = eLazyBoolYes; - else if (x == "ReverseContinue+") - m_supports_reverse_continue = eLazyBoolYes; - else if (x == "ReverseStep+") - m_supports_reverse_step = eLazyBoolYes; // Look for a list of compressions in the features list e.g. // qXfer:features:read+;PacketSize=20000;qEcho+;SupportedCompressions=zlib- // deflate,lzma diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h index 116b47c1edf033..898d176abc3465 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.h @@ -331,10 +331,6 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { bool GetMultiprocessSupported(); - bool GetReverseContinueSupported(); - - bool GetReverseStepSupported(); - LazyBool SupportsAllocDeallocMemory() // const { // Uncomment this to have lldb pretend the debug server doesn't respond to @@ -565,8 +561,6 @@ class GDBRemoteCommunicationClient : public GDBRemoteClientBase { LazyBool m_supports_memory_tagging = eLazyBoolCalculate; LazyBool m_supports_qSaveCore = eLazyBoolCalculate; LazyBool m_uses_native_signals = eLazyBoolCalculate; - LazyBool 
m_supports_reverse_continue = eLazyBoolCalculate; - LazyBool m_supports_reverse_step = eLazyBoolCalculate; bool m_supports_qProcessInfoPID : 1, m_supports_qfProcessInfo : 1, m_supports_qUserName : 1, m_supports_qGroupName : 1, diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 4016cde74ebea8..35fa93e53bc66f 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -716,7 +716,6 @@ static const char *GetStopReasonString(StopReason stop_reason) { return "vforkdone"; case eStopReasonInterrupt: return "async interrupt"; - case eStopReasonHistoryBoundary: case eStopReasonInstrumentation: case eStopReasonInvalid: case eStopReasonPlanComplete: diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3fc03bd05d5df0..3e09c316d74f44 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -169,10 +169,6 @@ class PluginProperties : public Properties { } }; -std::chrono::seconds ResumeTimeout() { - return std::chrono::seconds(5); -} - } // namespace static PluginProperties &GetGlobalPluginProperties() { @@ -1184,11 +1180,10 @@ Status ProcessGDBRemote::WillResume() { return Status(); } -Status ProcessGDBRemote::DoResume(RunDirection direction) { +Status ProcessGDBRemote::DoResume() { Status error; Log *log = GetLog(GDBRLog::Process); - LLDB_LOGF(log, "ProcessGDBRemote::Resume(%s)", - direction == RunDirection::eRunForward ? 
"" : "reverse"); + LLDB_LOGF(log, "ProcessGDBRemote::Resume()"); ListenerSP listener_sp( Listener::MakeListener("gdb-remote.resume-packet-sent")); @@ -1202,21 +1197,12 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { StreamString continue_packet; bool continue_packet_error = false; - // Number of threads continuing with "c", i.e. continuing without a signal to deliver. - const size_t num_continue_c_tids = m_continue_c_tids.size(); - // Number of threads continuing with "C", i.e. continuing with a signal to deliver. - const size_t num_continue_C_tids = m_continue_C_tids.size(); - // Number of threads continuing with "s", i.e. single-stepping. - const size_t num_continue_s_tids = m_continue_s_tids.size(); - // Number of threads continuing with "S", i.e. single-stepping with a signal to deliver. - const size_t num_continue_S_tids = m_continue_S_tids.size(); - if (direction == RunDirection::eRunForward && - m_gdb_comm.HasAnyVContSupport()) { + if (m_gdb_comm.HasAnyVContSupport()) { std::string pid_prefix; if (m_gdb_comm.GetMultiprocessSupported()) pid_prefix = llvm::formatv("p{0:x-}.", GetID()); - if (num_continue_c_tids == num_threads || + if (m_continue_c_tids.size() == num_threads || (m_continue_c_tids.empty() && m_continue_C_tids.empty() && m_continue_s_tids.empty() && m_continue_S_tids.empty())) { // All threads are continuing @@ -1279,11 +1265,14 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { } else continue_packet_error = true; - if (direction == RunDirection::eRunForward && continue_packet_error) { + if (continue_packet_error) { // Either no vCont support, or we tried to use part of the vCont packet - // that wasn't supported by the remote GDB server, or it's the reverse - // direction. We need to try and make a simple packet that can do our - // continue. + // that wasn't supported by the remote GDB server. 
We need to try and + // make a simple packet that can do our continue + const size_t num_continue_c_tids = m_continue_c_tids.size(); + const size_t num_continue_C_tids = m_continue_C_tids.size(); + const size_t num_continue_s_tids = m_continue_s_tids.size(); + const size_t num_continue_S_tids = m_continue_S_tids.size(); if (num_continue_c_tids > 0) { if (num_continue_c_tids == num_threads) { // All threads are resuming... @@ -1374,41 +1363,9 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { } } - if (direction == RunDirection::eRunReverse && continue_packet_error) { - if (num_continue_C_tids > 0 || num_continue_S_tids > 0) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: Signals not supported"); - return Status::FromErrorString("can't deliver signals while running in reverse"); - } - - if (num_continue_s_tids > 0) { - if (num_continue_s_tids > 1) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: can't step multiple threads"); - return Status::FromErrorString("can't step multiple threads while reverse-stepping"); - } - - if (!m_gdb_comm.GetReverseStepSupported()) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-stepping"); - return Status::FromErrorString("target does not support reverse-stepping"); - } - - m_gdb_comm.SetCurrentThreadForRun(m_continue_s_tids.front()); - continue_packet.PutCString("bs"); - } else { - if (!m_gdb_comm.GetReverseContinueSupported()) { - LLDB_LOGF(log, "ProcessGDBRemote::DoResumeReverse: target does not support reverse-continue"); - return Status::FromErrorString("target does not support reverse-continue"); - } - - // All threads continue whether requested or not --- - // we can't change how threads ran in the past. 
- continue_packet.PutCString("bc"); - } - - continue_packet_error = false; - } - if (continue_packet_error) { - return Status::FromErrorString("can't make continue packet for this resume"); + error = + Status::FromErrorString("can't make continue packet for this resume"); } else { EventSP event_sp; if (!m_async_thread.IsJoinable()) { @@ -1423,7 +1380,7 @@ Status ProcessGDBRemote::DoResume(RunDirection direction) { std::make_shared(continue_packet.GetString()); m_async_broadcaster.BroadcastEvent(eBroadcastBitAsyncContinue, data_sp); - if (!listener_sp->GetEvent(event_sp, ResumeTimeout())) { + if (!listener_sp->GetEvent(event_sp, std::chrono::seconds(5))) { error = Status::FromErrorString("Resume timed out."); LLDB_LOGF(log, "ProcessGDBRemote::DoResume: Resume timed out."); } else if (event_sp->BroadcasterIs(&m_async_broadcaster)) { @@ -1906,10 +1863,6 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo( thread_sp->SetStopInfo(StopInfo::CreateStopReasonWithException( *thread_sp, description.c_str())); handled = true; - } else if (reason == "replaylog") { - thread_sp->SetStopInfo(StopInfo::CreateStopReasonHistoryBoundary( - *thread_sp, description.c_str())); - handled = true; } else if (reason == "exec") { did_exec = true; thread_sp->SetStopInfo( @@ -2365,8 +2318,6 @@ StateType ProcessGDBRemote::SetThreadStopInfo(StringExtractor &stop_packet) { description = std::string(ostr.GetString()); } else if (key.compare("swbreak") == 0 || key.compare("hwbreak") == 0) { reason = "breakpoint"; - } else if (key.compare("replaylog") == 0) { - reason = "replaylog"; } else if (key.compare("library") == 0) { auto error = LoadModules(); if (error) { diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h index fa3e1cec76e2b3..2492795851388a 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.h @@ -111,7 +111,7 @@ class ProcessGDBRemote 
: public Process, // Process Control Status WillResume() override; - Status DoResume(lldb::RunDirection direction) override; + Status DoResume() override; Status DoHalt(bool &caused_stop) override; diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp index 304c12173dd35d..d2111ce877ce55 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.cpp @@ -182,15 +182,10 @@ void ScriptedProcess::DidResume() { m_pid = GetInterface().GetProcessID(); } -Status ScriptedProcess::DoResume(RunDirection direction) { +Status ScriptedProcess::DoResume() { LLDB_LOGF(GetLog(LLDBLog::Process), "ScriptedProcess::%s resuming process", __FUNCTION__); - if (direction == RunDirection::eRunForward) { - return GetInterface().Resume(); - } else { - return Status::FromErrorStringWithFormatv( - "error: {0} does not support reverse execution of processes", GetPluginName()); - } + return GetInterface().Resume(); } Status ScriptedProcess::DoAttach(const ProcessAttachInfo &attach_info) { diff --git a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h index 8ebe4ca5f3d449..0335364b4010b2 100644 --- a/lldb/source/Plugins/Process/scripted/ScriptedProcess.h +++ b/lldb/source/Plugins/Process/scripted/ScriptedProcess.h @@ -52,7 +52,7 @@ class ScriptedProcess : public Process { void DidResume() override; - Status DoResume(lldb::RunDirection direction) override; + Status DoResume() override; Status DoAttachToProcessWithID(lldb::pid_t pid, const ProcessAttachInfo &attach_info) override; diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index fd683728388215..c009d17d3ba507 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -446,8 +446,7 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, m_memory_cache(*this), 
m_allocated_memory_cache(*this), m_should_detach(false), m_next_event_action_up(), m_public_run_lock(), m_private_run_lock(), m_currently_handling_do_on_removals(false), - m_resume_requested(false), m_last_run_direction(eRunForward), - m_interrupt_tid(LLDB_INVALID_THREAD_ID), + m_resume_requested(false), m_interrupt_tid(LLDB_INVALID_THREAD_ID), m_finalizing(false), m_destructing(false), m_clear_thread_plans_on_stop(false), m_force_next_event_delivery(false), m_last_broadcast_state(eStateInvalid), m_destroy_in_process(false), @@ -846,7 +845,6 @@ bool Process::HandleProcessStateChangedEvent( switch (thread_stop_reason) { case eStopReasonInvalid: case eStopReasonNone: - case eStopReasonHistoryBoundary: break; case eStopReasonSignal: { @@ -1354,7 +1352,7 @@ void Process::SetPublicState(StateType new_state, bool restarted) { } } -Status Process::Resume(RunDirection direction) { +Status Process::Resume() { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "(plugin = %s) -- locking run lock", GetPluginName().data()); if (!m_public_run_lock.TrySetRunning()) { @@ -1363,7 +1361,7 @@ Status Process::Resume(RunDirection direction) { return Status::FromErrorString( "Resume request failed - process still running."); } - Status error = PrivateResume(direction); + Status error = PrivateResume(); if (!error.Success()) { // Undo running state change m_public_run_lock.SetStopped(); @@ -1371,7 +1369,7 @@ Status Process::Resume(RunDirection direction) { return error; } -Status Process::ResumeSynchronous(Stream *stream, RunDirection direction) { +Status Process::ResumeSynchronous(Stream *stream) { Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); LLDB_LOGF(log, "Process::ResumeSynchronous -- locking run lock"); if (!m_public_run_lock.TrySetRunning()) { @@ -1384,7 +1382,7 @@ Status Process::ResumeSynchronous(Stream *stream, RunDirection direction) { Listener::MakeListener(ResumeSynchronousHijackListenerName.data())); HijackProcessEvents(listener_sp); - Status error = 
PrivateResume(direction); + Status error = PrivateResume(); if (error.Success()) { StateType state = WaitForProcessToStop(std::nullopt, nullptr, true, listener_sp, stream, @@ -3241,7 +3239,7 @@ Status Process::ConnectRemote(llvm::StringRef remote_url) { return error; } -Status Process::PrivateResume(RunDirection direction) { +Status Process::PrivateResume() { Log *log(GetLog(LLDBLog::Process | LLDBLog::Step)); LLDB_LOGF(log, "Process::PrivateResume() m_stop_id = %u, public state: %s " @@ -3257,15 +3255,6 @@ Status Process::PrivateResume(RunDirection direction) { if (!GetModID().IsLastResumeForUserExpression()) ResetExtendedCrashInfoDict(); - if (m_last_run_direction != direction) { - // In the future we might want to support mixed-direction plans, - // e.g. a forward step-over stops at a breakpoint, the user does - // a reverse-step, then disables the breakpoint and continues forward. - // This code will need to be changed to support that. - m_thread_list.DiscardThreadPlans(); - m_last_run_direction = direction; - } - Status error(WillResume()); // Tell the process it is about to resume before the thread list if (error.Success()) { @@ -3283,7 +3272,7 @@ Status Process::PrivateResume(RunDirection direction) { "Process::PrivateResume PreResumeActions failed, not resuming."); } else { m_mod_id.BumpResumeID(); - error = DoResume(direction); + error = DoResume(); if (error.Success()) { DidResume(); m_thread_list.DidResume(); @@ -3746,7 +3735,7 @@ bool Process::ShouldBroadcastEvent(Event *event_ptr) { "from state: %s", static_cast(event_ptr), StateAsCString(state)); ProcessEventData::SetRestartedInEvent(event_ptr, true); - PrivateResume(m_last_run_direction); + PrivateResume(); } } else { return_value = true; @@ -4357,7 +4346,7 @@ void Process::ProcessEventData::DoOnRemoval(Event *event_ptr) { SetRestarted(true); // Use the private resume method here, since we aren't changing the run // lock state. 
- process_sp->PrivateResume(process_sp->m_last_run_direction); + process_sp->PrivateResume(); } else { bool hijacked = process_sp->IsHijackedForEvent(eBroadcastBitStateChanged) && !process_sp->StateChangedIsHijackedForSynchronousResume(); diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index 08e9a7c099bad2..bd7032b803df90 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -1212,30 +1212,6 @@ class StopInfoProcessorTrace : public StopInfo { } }; -// StopInfoHistoryBoundary - -class StopInfoHistoryBoundary : public StopInfo { -public: - StopInfoHistoryBoundary(Thread &thread, const char *description) - : StopInfo(thread, LLDB_INVALID_UID) { - if (description) - SetDescription(description); - } - - ~StopInfoHistoryBoundary() override = default; - - StopReason GetStopReason() const override { - return eStopReasonHistoryBoundary; - } - - const char *GetDescription() override { - if (m_description.empty()) - return "history boundary"; - else - return m_description.c_str(); - } -}; - // StopInfoThreadPlan class StopInfoThreadPlan : public StopInfo { @@ -1463,11 +1439,6 @@ StopInfoSP StopInfo::CreateStopReasonProcessorTrace(Thread &thread, return StopInfoSP(new StopInfoProcessorTrace(thread, description)); } -StopInfoSP StopInfo::CreateStopReasonHistoryBoundary(Thread &thread, - const char *description) { - return StopInfoSP(new StopInfoHistoryBoundary(thread, description)); -} - StopInfoSP StopInfo::CreateStopReasonWithExec(Thread &thread) { return StopInfoSP(new StopInfoExec(thread)); } diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index bbb586f033b746..902fbb2b519ef7 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -624,12 +624,10 @@ void Thread::SetupForResume() { // what the current plan is. 
lldb::RegisterContextSP reg_ctx_sp(GetRegisterContext()); - ProcessSP process_sp(GetProcess()); - if (reg_ctx_sp && process_sp && - process_sp->GetLastRunDirection() == eRunForward) { + if (reg_ctx_sp) { const addr_t thread_pc = reg_ctx_sp->GetPC(); BreakpointSiteSP bp_site_sp = - process_sp->GetBreakpointSiteList().FindByAddress(thread_pc); + GetProcess()->GetBreakpointSiteList().FindByAddress(thread_pc); if (bp_site_sp) { // Note, don't assume there's a ThreadPlanStepOverBreakpoint, the // target may not require anything special to step over a breakpoint. @@ -1734,8 +1732,6 @@ std::string Thread::StopReasonAsString(lldb::StopReason reason) { return "processor trace"; case eStopReasonInterrupt: return "async interrupt"; - case eStopReasonHistoryBoundary: - return "history boundary"; } return "StopReason = " + std::to_string(reason); diff --git a/lldb/test/API/functionalities/reverse-execution/Makefile b/lldb/test/API/functionalities/reverse-execution/Makefile deleted file mode 100644 index 10495940055b63..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -C_SOURCES := main.c - -include Makefile.rules diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py deleted file mode 100644 index b37578fbd82468..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueBreakpoints.py +++ /dev/null @@ -1,115 +0,0 @@ -import lldb -import time -import unittest -from lldbsuite.test.lldbtest import * -from lldbsuite.test.decorators import * -from lldbsuite.test.gdbclientutils import * -from lldbsuite.test.lldbreverse import ReverseTestBase -from lldbsuite.test import lldbutil - - -class TestReverseContinueBreakpoints(ReverseTestBase): - NO_DEBUG_INFO_TESTCASE = True - - def test_reverse_continue(self): - self.reverse_continue_internal(async_mode=False) - - def 
test_reverse_continue_async(self): - self.reverse_continue_internal(async_mode=True) - - def reverse_continue_internal(self, async_mode): - target, process, initial_threads = self.setup_recording(async_mode) - - # Reverse-continue. We'll stop at the point where we started recording. - status = process.Continue(lldb.eRunReverse) - self.assertSuccess(status) - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) - self.expect( - "thread list", - STOPPED_DUE_TO_HISTORY_BOUNDARY, - substrs=["stopped", "stop reason = history boundary"], - ) - - # Continue forward normally until the target exits. - status = process.Continue() - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateExited]) - self.assertSuccess(status) - self.assertState(process.GetState(), lldb.eStateExited) - self.assertEqual(process.GetExitStatus(), 0) - - def test_reverse_continue_breakpoint(self): - self.reverse_continue_breakpoint_internal(async_mode=False) - - def test_reverse_continue_breakpoint_async(self): - self.reverse_continue_breakpoint_internal(async_mode=True) - - def reverse_continue_breakpoint_internal(self, async_mode): - target, process, initial_threads = self.setup_recording(async_mode) - - # Reverse-continue to the function "trigger_breakpoint". 
- trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) - status = process.Continue(lldb.eRunReverse) - self.assertSuccess(status) - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) - threads_now = lldbutil.get_threads_stopped_at_breakpoint(process, trigger_bkpt) - self.assertEqual(threads_now, initial_threads) - - def test_reverse_continue_skip_breakpoint(self): - self.reverse_continue_skip_breakpoint_internal(async_mode=False) - - def test_reverse_continue_skip_breakpoint_async(self): - self.reverse_continue_skip_breakpoint_internal(async_mode=True) - - def reverse_continue_skip_breakpoint_internal(self, async_mode): - target, process, initial_threads = self.setup_recording(async_mode) - - # Reverse-continue over a breakpoint at "trigger_breakpoint" whose - # condition is false. - # This tests that we continue in the correct direction after hitting - # the breakpoint. - trigger_bkpt = target.BreakpointCreateByName("trigger_breakpoint", None) - trigger_bkpt.SetCondition("false_condition") - status = process.Continue(lldb.eRunReverse) - self.expect_async_state_changes(async_mode, process, [lldb.eStateRunning, lldb.eStateStopped]) - self.assertSuccess(status) - self.expect( - "thread list", - STOPPED_DUE_TO_HISTORY_BOUNDARY, - substrs=["stopped", "stop reason = history boundary"], - ) - - def setup_recording(self, async_mode): - """ - Record execution of code between "start_recording" and "stop_recording" breakpoints. - - Returns with the target stopped at "stop_recording", with recording disabled, - ready to reverse-execute. - """ - self.build() - target = self.dbg.CreateTarget("") - process = self.connect(target) - - # Record execution from the start of the function "start_recording" - # to the start of the function "stop_recording". We want to keep the - # interval that we record as small as possible to minimize the run-time - # of our single-stepping recorder. 
- start_recording_bkpt = target.BreakpointCreateByName("start_recording", None) - initial_threads = lldbutil.continue_to_breakpoint(process, start_recording_bkpt) - self.assertEqual(len(initial_threads), 1) - target.BreakpointDelete(start_recording_bkpt.GetID()) - self.start_recording() - stop_recording_bkpt = target.BreakpointCreateByName("stop_recording", None) - lldbutil.continue_to_breakpoint(process, stop_recording_bkpt) - target.BreakpointDelete(stop_recording_bkpt.GetID()) - self.stop_recording() - - self.dbg.SetAsync(async_mode) - self.expect_async_state_changes(async_mode, process, [lldb.eStateStopped]) - - return target, process, initial_threads - - def expect_async_state_changes(self, async_mode, process, states): - if not async_mode: - return - listener = self.dbg.GetListener() - lldbutil.expect_state_changes(self, listener, process, states) diff --git a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py b/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py deleted file mode 100644 index d610761b8cb0bc..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/TestReverseContinueNotSupported.py +++ /dev/null @@ -1,30 +0,0 @@ -import lldb -import unittest -from lldbsuite.test.lldbtest import * -from lldbsuite.test.decorators import * -from lldbsuite.test import lldbutil - - -class TestReverseContinueNotSupported(TestBase): - NO_DEBUG_INFO_TESTCASE = True - - def test_reverse_continue_not_supported(self): - self.build() - exe = self.getBuildArtifact("a.out") - target = self.dbg.CreateTarget(exe) - self.assertTrue(target, VALID_TARGET) - - main_bkpt = target.BreakpointCreateByName("main", None) - self.assertTrue(main_bkpt, VALID_BREAKPOINT) - - process = target.LaunchSimple(None, None, self.get_process_working_directory()) - self.assertTrue(process, PROCESS_IS_VALID) - - # This will fail gracefully. 
- status = process.Continue(lldb.eRunReverse) - self.assertFailure(status, "target does not support reverse-continue") - - status = process.Continue() - self.assertSuccess(status) - self.assertState(process.GetState(), lldb.eStateExited) - self.assertEqual(process.GetExitStatus(), 0) diff --git a/lldb/test/API/functionalities/reverse-execution/main.c b/lldb/test/API/functionalities/reverse-execution/main.c deleted file mode 100644 index 40e45dc9f5c317..00000000000000 --- a/lldb/test/API/functionalities/reverse-execution/main.c +++ /dev/null @@ -1,14 +0,0 @@ -volatile int false_condition = 0; - -static void start_recording() {} - -static void trigger_breakpoint() {} - -static void stop_recording() {} - -int main() { - start_recording(); - trigger_breakpoint(); - stop_recording(); - return 0; -} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 211fd34957f496..558f889c4b7f23 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -1045,9 +1045,6 @@ llvm::json::Value CreateThreadStopped(lldb::SBThread &thread, case lldb::eStopReasonProcessorTrace: body.try_emplace("reason", "processor trace"); break; - case lldb::eStopReasonHistoryBoundary: - body.try_emplace("reason", "history boundary"); - break; case lldb::eStopReasonSignal: case lldb::eStopReasonException: body.try_emplace("reason", "exception"); diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp index 1c5e3ac7008727..b38833c0fdb6b6 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.cpp +++ b/lldb/tools/lldb-dap/LLDBUtils.cpp @@ -111,7 +111,6 @@ bool ThreadHasStopReason(lldb::SBThread &thread) { case lldb::eStopReasonVFork: case lldb::eStopReasonVForkDone: case lldb::eStopReasonInterrupt: - case lldb::eStopReasonHistoryBoundary: return true; case lldb::eStopReasonThreadExiting: case lldb::eStopReasonInvalid: From 2647505027d8c01fc920b04aced8cec742a4b2ed Mon Sep 17 00:00:00 2001 From: Finn Plummer 
<50529406+inbelic@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:34:26 -0700 Subject: [PATCH 100/177] [HLSL] Implement the `degrees` intrinsic (#111209) - add degrees builtin - link degrees api in hlsl_intrinsics.h - add degrees intrinsic to IntrinsicsDirectX.td - add degrees intrinsic to IntrinsicsSPIRV.td - add lowering from clang builtin to dx/spv intrinsics in CGBuiltin.cpp - add semantic checks to SemaHLSL.cpp - add expansion of directx intrinsic to llvm fmul for DirectX in DXILIntrinsicExpansion.cpp - add mapping to spir-v intrinsic in SPIRVInstructionSelector.cpp - add test coverage: - degrees.hlsl -> check hlsl lowering to dx/spv degrees intrinsics - degrees-errors.hlsl/half-float-only-errors -> check semantic warnings - hlsl-intrinsics/degrees.ll -> check lowering of spir-v degrees intrinsic to SPIR-V backend - DirectX/degrees.ll -> check expansion and scalarization of directx degrees intrinsic to fmul Resolves #99104 --- clang/include/clang/Basic/Builtins.td | 6 ++ clang/lib/CodeGen/CGBuiltin.cpp | 10 +++ clang/lib/CodeGen/CGHLSLRuntime.h | 1 + clang/lib/Headers/hlsl/hlsl_intrinsics.h | 30 +++++++++ clang/lib/Sema/SemaHLSL.cpp | 1 + clang/test/CodeGenHLSL/builtins/degrees.hlsl | 64 +++++++++++++++++++ .../SemaHLSL/BuiltIns/degrees-errors.hlsl | 26 ++++++++ .../BuiltIns/half-float-only-errors.hlsl | 1 + llvm/include/llvm/IR/IntrinsicsDirectX.td | 1 + llvm/include/llvm/IR/IntrinsicsSPIRV.td | 1 + .../Target/DirectX/DXILIntrinsicExpansion.cpp | 12 ++++ .../Target/SPIRV/SPIRVInstructionSelector.cpp | 2 + llvm/test/CodeGen/DirectX/degrees.ll | 54 ++++++++++++++++ .../CodeGen/SPIRV/hlsl-intrinsics/degrees.ll | 52 +++++++++++++++ llvm/test/CodeGen/SPIRV/opencl/degrees.ll | 50 +++++++++++++++ 15 files changed, 311 insertions(+) create mode 100644 clang/test/CodeGenHLSL/builtins/degrees.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/degrees-errors.hlsl create mode 100644 llvm/test/CodeGen/DirectX/degrees.ll create mode 100644 
llvm/test/CodeGen/SPIRV/hlsl-intrinsics/degrees.ll create mode 100644 llvm/test/CodeGen/SPIRV/opencl/degrees.ll diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 9ebee81fcb0d3d..7068473a0e12ac 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4745,6 +4745,12 @@ def HLSLCross: LangBuiltin<"HLSL_LANG"> { let Prototype = "void(...)"; } +def HLSLDegrees : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_elementwise_degrees"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(...)"; +} + def HLSLDotProduct : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_dot"]; let Attributes = [NoThrow, Const]; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 06140d6d4ce27b..ff678ee04f9c2a 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18755,6 +18755,16 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, CGM.getHLSLRuntime().getNormalizeIntrinsic(), ArrayRef{X}, nullptr, "hlsl.normalize"); } + case Builtin::BI__builtin_hlsl_elementwise_degrees: { + Value *X = EmitScalarExpr(E->getArg(0)); + + assert(E->getArg(0)->getType()->hasFloatingRepresentation() && + "degree operand must have a float representation"); + + return Builder.CreateIntrinsic( + /*ReturnType=*/X->getType(), CGM.getHLSLRuntime().getDegreesIntrinsic(), + ArrayRef{X}, nullptr, "hlsl.degrees"); + } case Builtin::BI__builtin_hlsl_elementwise_frac: { Value *Op0 = EmitScalarExpr(E->getArg(0)); if (!E->getArg(0)->getType()->hasFloatingRepresentation()) diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 05ff325216f55b..282fa44af212fb 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -75,6 +75,7 @@ class CGHLSLRuntime { GENERATE_HLSL_INTRINSIC_FUNCTION(All, all) GENERATE_HLSL_INTRINSIC_FUNCTION(Any, any) 
GENERATE_HLSL_INTRINSIC_FUNCTION(Cross, cross) + GENERATE_HLSL_INTRINSIC_FUNCTION(Degrees, degrees) GENERATE_HLSL_INTRINSIC_FUNCTION(Frac, frac) GENERATE_HLSL_INTRINSIC_FUNCTION(Length, length) GENERATE_HLSL_INTRINSIC_FUNCTION(Lerp, lerp) diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index 813f8a317bf6bf..137467e5a782ce 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -766,6 +766,36 @@ uint64_t3 countbits(uint64_t3); _HLSL_BUILTIN_ALIAS(__builtin_elementwise_popcount) uint64_t4 countbits(uint64_t4); +//===----------------------------------------------------------------------===// +// degrees builtins +//===----------------------------------------------------------------------===// + +/// \fn T degrees(T x) +/// \brief Converts the specified value from radians to degrees. +/// \param x The specified input value. + +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +half degrees(half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +half2 degrees(half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +half3 degrees(half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +half4 degrees(half4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +float degrees(float); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +float2 degrees(float2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +float3 degrees(float3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_degrees) +float4 degrees(float4); + //===----------------------------------------------------------------------===// // dot product builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaHLSL.cpp 
b/clang/lib/Sema/SemaHLSL.cpp index b0acbbbbb2b1f0..137b15c8fcfe98 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1896,6 +1896,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return true; break; } + case Builtin::BI__builtin_hlsl_elementwise_degrees: case Builtin::BI__builtin_hlsl_elementwise_radians: case Builtin::BI__builtin_hlsl_elementwise_rsqrt: case Builtin::BI__builtin_hlsl_elementwise_frac: { diff --git a/clang/test/CodeGenHLSL/builtins/degrees.hlsl b/clang/test/CodeGenHLSL/builtins/degrees.hlsl new file mode 100644 index 00000000000000..9e131f4badc19a --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/degrees.hlsl @@ -0,0 +1,64 @@ +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv + +// NATIVE_HALF: define [[FNATTRS]] half @ +// NATIVE_HALF: %hlsl.degrees = call half @llvm.[[TARGET]].degrees.f16( +// NATIVE_HALF: ret half %hlsl.degrees +// NO_HALF: define [[FNATTRS]] float @ +// NO_HALF: %hlsl.degrees = call float 
@llvm.[[TARGET]].degrees.f32( +// NO_HALF: ret float %hlsl.degrees +half test_degrees_half(half p0) { return degrees(p0); } +// NATIVE_HALF: define [[FNATTRS]] <2 x half> @ +// NATIVE_HALF: %hlsl.degrees = call <2 x half> @llvm.[[TARGET]].degrees.v2f16 +// NATIVE_HALF: ret <2 x half> %hlsl.degrees +// NO_HALF: define [[FNATTRS]] <2 x float> @ +// NO_HALF: %hlsl.degrees = call <2 x float> @llvm.[[TARGET]].degrees.v2f32( +// NO_HALF: ret <2 x float> %hlsl.degrees +half2 test_degrees_half2(half2 p0) { return degrees(p0); } +// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ +// NATIVE_HALF: %hlsl.degrees = call <3 x half> @llvm.[[TARGET]].degrees.v3f16 +// NATIVE_HALF: ret <3 x half> %hlsl.degrees +// NO_HALF: define [[FNATTRS]] <3 x float> @ +// NO_HALF: %hlsl.degrees = call <3 x float> @llvm.[[TARGET]].degrees.v3f32( +// NO_HALF: ret <3 x float> %hlsl.degrees +half3 test_degrees_half3(half3 p0) { return degrees(p0); } +// NATIVE_HALF: define [[FNATTRS]] <4 x half> @ +// NATIVE_HALF: %hlsl.degrees = call <4 x half> @llvm.[[TARGET]].degrees.v4f16 +// NATIVE_HALF: ret <4 x half> %hlsl.degrees +// NO_HALF: define [[FNATTRS]] <4 x float> @ +// NO_HALF: %hlsl.degrees = call <4 x float> @llvm.[[TARGET]].degrees.v4f32( +// NO_HALF: ret <4 x float> %hlsl.degrees +half4 test_degrees_half4(half4 p0) { return degrees(p0); } + +// CHECK: define [[FNATTRS]] float @ +// CHECK: %hlsl.degrees = call float @llvm.[[TARGET]].degrees.f32( +// CHECK: ret float %hlsl.degrees +float test_degrees_float(float p0) { return degrees(p0); } +// CHECK: define [[FNATTRS]] <2 x float> @ +// CHECK: %hlsl.degrees = call <2 x float> @llvm.[[TARGET]].degrees.v2f32 +// CHECK: ret <2 x float> %hlsl.degrees +float2 test_degrees_float2(float2 p0) { return degrees(p0); } +// CHECK: define [[FNATTRS]] <3 x float> @ +// CHECK: %hlsl.degrees = call <3 x float> @llvm.[[TARGET]].degrees.v3f32 +// CHECK: ret <3 x float> %hlsl.degrees +float3 test_degrees_float3(float3 p0) { return degrees(p0); } +// CHECK: define 
[[FNATTRS]] <4 x float> @ +// CHECK: %hlsl.degrees = call <4 x float> @llvm.[[TARGET]].degrees.v4f32 +// CHECK: ret <4 x float> %hlsl.degrees +float4 test_degrees_float4(float4 p0) { return degrees(p0); } diff --git a/clang/test/SemaHLSL/BuiltIns/degrees-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/degrees-errors.hlsl new file mode 100644 index 00000000000000..9e981f6973572d --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/degrees-errors.hlsl @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify + +float test_too_few_arg() { + return __builtin_hlsl_elementwise_degrees(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} +} + +float2 test_too_many_arg(float2 p0) { + return __builtin_hlsl_elementwise_degrees(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} +} + +float builtin_bool_to_float_type_promotion(bool p1) { + return __builtin_hlsl_elementwise_degrees(p1); + // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}} +} + +float builtin_degrees_int_to_float_promotion(int p1) { + return __builtin_hlsl_elementwise_degrees(p1); + // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}} +} + +float2 builtin_degrees_int2_to_float2_promotion(int2 p1) { + return __builtin_hlsl_elementwise_degrees(p1); + // expected-error@-1 {{passing 'int2' (aka 'vector') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl index 2cecf7aeb00e46..cdd130052b6a67 100644 --- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl @@ -17,6 +17,7 @@ // RUN: %clang_cc1 -finclude-default-header -triple 
dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tan // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_tanh // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_degrees // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_hlsl_elementwise_radians double test_double_builtin(double p0) { diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 1cf6acbf126475..45aea1ccdb6d4c 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -71,6 +71,7 @@ def int_dx_udot : [IntrNoMem, Commutative] >; def int_dx_frac : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; +def int_dx_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_dx_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 92d2f67399d263..3d61456589ee0d 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -62,6 +62,7 @@ let TargetPrefix = "spv" in { def int_spv_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem]>; def int_spv_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty], 
[IntrNoMem]>; def int_spv_cross : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + def int_spv_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_frac : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_lerp : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem] >; diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index 1e84a7216013da..fb5383b3514a5a 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -56,6 +56,7 @@ static bool isIntrinsicExpansion(Function &F) { case Intrinsic::dx_clamp: case Intrinsic::dx_cross: case Intrinsic::dx_uclamp: + case Intrinsic::dx_degrees: case Intrinsic::dx_lerp: case Intrinsic::dx_length: case Intrinsic::dx_normalize: @@ -490,6 +491,14 @@ static Value *expandClampIntrinsic(CallInst *Orig, {MaxCall, Max}, nullptr, "dx.min"); } +static Value *expandDegreesIntrinsic(CallInst *Orig) { + Value *X = Orig->getOperand(0); + Type *Ty = X->getType(); + IRBuilder<> Builder(Orig); + Value *DegreesRatio = ConstantFP::get(Ty, 180.0 * llvm::numbers::inv_pi); + return Builder.CreateFMul(X, DegreesRatio); +} + static Value *expandSignIntrinsic(CallInst *Orig) { Value *X = Orig->getOperand(0); Type *Ty = X->getType(); @@ -549,6 +558,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { case Intrinsic::dx_clamp: Result = expandClampIntrinsic(Orig, IntrinsicId); break; + case Intrinsic::dx_degrees: + Result = expandDegreesIntrinsic(Orig); + break; case Intrinsic::dx_lerp: Result = expandLerpIntrinsic(Orig); break; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index e8b769b6fd6900..fd92346717c415 100644 --- 
a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -2513,6 +2513,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::mix, GL::FMix); case Intrinsic::spv_length: return selectExtInst(ResVReg, ResType, I, CL::length, GL::Length); + case Intrinsic::spv_degrees: + return selectExtInst(ResVReg, ResType, I, CL::degrees, GL::Degrees); case Intrinsic::spv_frac: return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract); case Intrinsic::spv_normalize: diff --git a/llvm/test/CodeGen/DirectX/degrees.ll b/llvm/test/CodeGen/DirectX/degrees.ll new file mode 100644 index 00000000000000..b38ac13d5f24e2 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/degrees.ll @@ -0,0 +1,54 @@ +; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s + +; Make sure dxil op function calls for degrees are expanded and lowered as fmul for float and half. 
+ +define noundef half @degrees_half(half noundef %a) { +; CHECK-LABEL: define noundef half @degrees_half( +; CHECK-SAME: half noundef [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DX_DEGREES1:%.*]] = fmul half [[A]], 0xH5329 +; CHECK-NEXT: ret half [[DX_DEGREES1]] +; +entry: + %dx.degrees = call half @llvm.dx.degrees.f16(half %a) + ret half %dx.degrees +} + +define noundef float @degrees_float(float noundef %a) #0 { +; CHECK-LABEL: define noundef float @degrees_float( +; CHECK-SAME: float noundef [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DEGREES:%.*]] = fmul float [[A]], 0x404CA5DC20000000 +; CHECK-NEXT: ret float [[DEGREES]] +; +entry: + %dx.degrees = call float @llvm.dx.degrees.f32(float %a) + ret float %dx.degrees +} + +define noundef <4 x float> @degrees_float4(<4 x float> noundef %a) #0 { +; CHECK-LABEL: define noundef <4 x float> @degrees_float4( +; CHECK-SAME: <4 x float> noundef [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A]], i64 0 +; CHECK-NEXT: [[DEGREES_A0:%.*]] = fmul float [[A0]], 0x404CA5DC20000000 +; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i64 1 +; CHECK-NEXT: [[DEGREES_A1:%.*]] = fmul float [[A1]], 0x404CA5DC20000000 +; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i64 2 +; CHECK-NEXT: [[DEGREES_A2:%.*]] = fmul float [[A2]], 0x404CA5DC20000000 +; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3 +; CHECK-NEXT: [[DEGREES_A3:%.*]] = fmul float [[A3]], 0x404CA5DC20000000 +; CHECK-NEXT: [[INSERT_0:%.*]] = insertelement <4 x float> poison, float [[DEGREES_A0]], i64 0 +; CHECK-NEXT: [[INSERT_1:%.*]] = insertelement <4 x float> [[INSERT_0]], float [[DEGREES_A1]], i64 1 +; CHECK-NEXT: [[INSERT_2:%.*]] = insertelement <4 x float> [[INSERT_1]], float [[DEGREES_A2]], i64 2 +; CHECK-NEXT: [[RES:%.*]] = insertelement <4 x float> [[INSERT_2]], float [[DEGREES_A3]], i64 3 +; CHECK-NEXT: ret <4 x float> [[RES]] +; +entry: + %2 = call <4 x float> 
@llvm.dx.degrees.v4f32(<4 x float> %a) + ret <4 x float> %2 +} + +declare half @llvm.dx.degrees.f16(half) +declare float @llvm.dx.degrees.f32(float) +declare <4 x float> @llvm.dx.degrees.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/degrees.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/degrees.ll new file mode 100644 index 00000000000000..533bcca6f62169 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/degrees.ll @@ -0,0 +1,52 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" + +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 + +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 + +; CHECK-LABEL: Begin function degrees_float +define noundef float @degrees_float(float noundef %a) { +entry: +; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]] +; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Degrees %[[#float_32_arg]] + %elt.degrees = call float @llvm.spv.degrees.f32(float %a) + ret float %elt.degrees +} + +; CHECK-LABEL: Begin function degrees_half +define noundef half @degrees_half(half noundef %a) { +entry: +; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]] +; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Degrees %[[#float_16_arg]] + %elt.degrees = call half @llvm.spv.degrees.f16(half %a) + ret half %elt.degrees +} + +; CHECK-LABEL: Begin function degrees_float_vector +define noundef <4 x float> @degrees_float_vector(<4 x float> noundef %a) { +entry: +; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]] +; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Degrees %[[#vec4_float_32_arg]] + %elt.degrees = 
call <4 x float> @llvm.spv.degrees.v4f32(<4 x float> %a) + ret <4 x float> %elt.degrees +} + +; CHECK-LABEL: Begin function degrees_half_vector +define noundef <4 x half> @degrees_half_vector(<4 x half> noundef %a) { +entry: +; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]] +; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Degrees %[[#vec4_float_16_arg]] + %elt.degrees = call <4 x half> @llvm.spv.degrees.v4f16(<4 x half> %a) + ret <4 x half> %elt.degrees +} + +declare half @llvm.spv.degrees.f16(half) +declare float @llvm.spv.degrees.f32(float) + +declare <4 x float> @llvm.spv.degrees.v4f32(<4 x float>) +declare <4 x half> @llvm.spv.degrees.v4f16(<4 x half>) diff --git a/llvm/test/CodeGen/SPIRV/opencl/degrees.ll b/llvm/test/CodeGen/SPIRV/opencl/degrees.ll new file mode 100644 index 00000000000000..88f97835fe7194 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/degrees.ll @@ -0,0 +1,50 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "OpenCL.std" + +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 + +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 + +declare half @llvm.spv.degrees.f16(half) +declare float @llvm.spv.degrees.f32(float) + +declare <4 x float> @llvm.spv.degrees.v4f32(<4 x float>) +declare <4 x half> @llvm.spv.degrees.v4f16(<4 x half>) + +define noundef float @degrees_float(float noundef %a) { +entry: +; CHECK: %[[#float_32_arg:]] = OpFunctionParameter %[[#float_32]] +; CHECK: %[[#]] = 
OpExtInst %[[#float_32]] %[[#op_ext_glsl]] degrees %[[#float_32_arg]] + %elt.degrees = call float @llvm.spv.degrees.f32(float %a) + ret float %elt.degrees +} + +define noundef half @degrees_half(half noundef %a) { +entry: +; CHECK: %[[#float_16_arg:]] = OpFunctionParameter %[[#float_16]] +; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] degrees %[[#float_16_arg]] + %elt.degrees = call half @llvm.spv.degrees.f16(half %a) + ret half %elt.degrees +} + +define noundef <4 x float> @degrees_float_vector(<4 x float> noundef %a) { +entry: +; CHECK: %[[#vec4_float_32_arg:]] = OpFunctionParameter %[[#vec4_float_32]] +; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] degrees %[[#vec4_float_32_arg]] + %elt.degrees = call <4 x float> @llvm.spv.degrees.v4f32(<4 x float> %a) + ret <4 x float> %elt.degrees +} + +define noundef <4 x half> @degrees_half_vector(<4 x half> noundef %a) { +entry: +; CHECK: %[[#vec4_float_16_arg:]] = OpFunctionParameter %[[#vec4_float_16]] +; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] degrees %[[#vec4_float_16_arg]] + %elt.degrees = call <4 x half> @llvm.spv.degrees.v4f16(<4 x half> %a) + ret <4 x half> %elt.degrees +} From 6640dac22b567e5f6c328ca56cf9bf43d45509e6 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Thu, 10 Oct 2024 16:43:29 -0700 Subject: [PATCH 101/177] [bazel] Add include-cleaner tests (#111924) --- .../include-cleaner/BUILD.bazel | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/include-cleaner/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/include-cleaner/BUILD.bazel index 28f90efb3ba7f3..5b210ad80c8740 100644 --- a/utils/bazel/llvm-project-overlay/clang-tools-extra/include-cleaner/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/include-cleaner/BUILD.bazel @@ -2,7 +2,9 @@ # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +load("@bazel_skylib//rules:expand_template.bzl", "expand_template") load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") +load("//llvm:lit_test.bzl", "lit_test", "package_path") package( default_visibility = ["//visibility:public"], @@ -61,3 +63,71 @@ cc_binary( "//llvm:Support", ], ) + +cc_test( + name = "unittests", + srcs = glob(["unittests/*.cpp"]), + deps = [ + ":include_cleaner", + ":include_cleaner_internal", + "//clang:ast", + "//clang:basic", + "//clang:format", + "//clang:frontend", + "//clang:lex", + "//clang:serialization", + "//clang:testing", + "//clang:tooling", + "//clang:tooling_inclusions", + "//llvm:Support", + "//llvm:TestingAnnotations", + "//third-party/unittest:gmock", + "//third-party/unittest:gtest", + ], +) + +LLVM_LIT_PATH_FUNCTION = " " + \ + "# Allow generated file to be relocatable.\n" + \ + "from pathlib import Path\n" + \ + "def path(p):\n" + \ + " p = Path(p)\n" + \ + " if p.exists: return str(p.resolve())\n" + \ + " if not p: return ''\n" + \ + " return str((Path(__file__).parent / p).resolve())\n" + +LIT_SITE_CFG_IN_HEADER = "# Autogenerated, do not edit." 
+ LLVM_LIT_PATH_FUNCTION + +expand_template( + name = "lit_site_cfg_py", + testonly = True, + out = "test/lit.site.cfg.py", + substitutions = { + "@CMAKE_CURRENT_BINARY_DIR@": package_path("//clang-tools-extra/include-cleaner:BUILD") + "/test", + "@CMAKE_CURRENT_SOURCE_DIR@": package_path("//clang-tools-extra/include-cleaner:BUILD") + "/test", + "@CURRENT_TOOLS_DIR@": package_path("//clang-tools-extra/include-cleaner:BUILD"), + "@LIT_SITE_CFG_IN_HEADER@": LIT_SITE_CFG_IN_HEADER, + "@LLVM_LIBS_DIR@": package_path("//llvm:BUILD"), + "@LLVM_LIT_TOOLS_DIR@": package_path("//llvm:BUILD"), + "@LLVM_TOOLS_DIR@": package_path("//llvm:BUILD"), + "@TARGET_TRIPLE@": "", + '"@Python3_EXECUTABLE@"': "sys.executable", + }, + template = "test/lit.site.cfg.py.in", +) + +[ + lit_test( + name = "%s.test" % src, + srcs = [src], + data = glob(["test/Inputs/**/*"]) + [ + "test/lit.cfg.py", + "test/lit.site.cfg.py", + ":clang-include-cleaner", + "//llvm:FileCheck", + "//llvm:count", + "//llvm:not", + ], + args = ["-svv"], + ) + for src in glob(["test/*.cpp"]) +] From 1037f577bd66ab03bc494120f024f2a52008e285 Mon Sep 17 00:00:00 2001 From: Igor Kudrin Date: Fri, 11 Oct 2024 06:51:27 +0700 Subject: [PATCH 102/177] [lld][elf] Warn if '*' pattern is used multiple times in version scripts (#102669) If this pattern is used more than once in version script(s), only one will have an effect, so it's probably a user error and can be diagnosed. --- lld/ELF/SymbolTable.cpp | 34 ++++++++++++++++++-- lld/test/ELF/version-script-reassign-glob.s | 4 ++- lld/test/ELF/version-script-warn.s | 35 +++++++++++++++++++++ 3 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 lld/test/ELF/version-script-warn.s diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp index db8ee8f4d7b3bb..b9ef28f0436f88 100644 --- a/lld/ELF/SymbolTable.cpp +++ b/lld/ELF/SymbolTable.cpp @@ -309,13 +309,43 @@ void SymbolTable::scanVersionScript() { // Then, assign versions to "*". 
In GNU linkers they have lower priority than // other wildcards. + bool globalAsteriskFound = false; + bool localAsteriskFound = false; + bool asteriskReported = false; + auto assignAsterisk = [&](SymbolVersion &pat, VersionDefinition *ver, + bool isLocal) { + // Avoid issuing a warning if both '--retain-symbols-file' and a version + // script with `global: *` are used. + // + // '--retain-symbols-file' adds a "*" pattern to + // 'config->versionDefinitions[VER_NDX_LOCAL].nonLocalPatterns', see + // 'readConfigs()' in 'Driver.cpp'. Note that it is not '.localPatterns', + // and may seem counterintuitive, but still works as expected. Here we can + // exploit that and skip analyzing the pattern added for this option. + if (!asteriskReported && (isLocal || ver->id > VER_NDX_LOCAL)) { + if ((isLocal && globalAsteriskFound) || + (!isLocal && localAsteriskFound)) { + warn("wildcard pattern '*' is used for both 'local' and 'global' " + "scopes in version script"); + asteriskReported = true; + } else if (!isLocal && globalAsteriskFound) { + warn("wildcard pattern '*' is used for multiple version definitions in " + "version script"); + asteriskReported = true; + } else { + localAsteriskFound = isLocal; + globalAsteriskFound = !isLocal; + } + } + assignWildcard(pat, isLocal ?
VER_NDX_LOCAL : ver->id, ver->name); + }; for (VersionDefinition &v : llvm::reverse(ctx.arg.versionDefinitions)) { for (SymbolVersion &pat : v.nonLocalPatterns) if (pat.hasWildcard && pat.name == "*") - assignWildcard(pat, v.id, v.name); + assignAsterisk(pat, &v, false); for (SymbolVersion &pat : v.localPatterns) if (pat.hasWildcard && pat.name == "*") - assignWildcard(pat, VER_NDX_LOCAL, v.name); + assignAsterisk(pat, &v, true); } // Symbol themselves might know their versions because symbols diff --git a/lld/test/ELF/version-script-reassign-glob.s b/lld/test/ELF/version-script-reassign-glob.s index 39d19a26fc4498..8de36467bd8ee6 100644 --- a/lld/test/ELF/version-script-reassign-glob.s +++ b/lld/test/ELF/version-script-reassign-glob.s @@ -10,7 +10,8 @@ # RUN: llvm-readelf --dyn-syms %t.so | FileCheck --check-prefix=BAR %s # RUN: echo 'bar1 { *; }; bar2 { *; };' > %t2.ver -# RUN: ld.lld --version-script %t2.ver %t.o -shared -o %t2.so --fatal-warnings +# RUN: ld.lld --version-script %t2.ver %t.o -shared -o %t2.so 2>&1 | \ +# RUN: FileCheck --check-prefix=DUPWARN %s # RUN: llvm-readelf --dyn-syms %t2.so | FileCheck --check-prefix=BAR2 %s ## If both a non-* glob and a * match, non-* wins. @@ -21,6 +22,7 @@ ## When there are multiple * patterns, the last wins. 
# BAR2: GLOBAL DEFAULT 7 foo@@bar2 +# DUPWARN: warning: wildcard pattern '*' is used for multiple version definitions in version script .globl foo foo: diff --git a/lld/test/ELF/version-script-warn.s b/lld/test/ELF/version-script-warn.s new file mode 100644 index 00000000000000..9aba596165796b --- /dev/null +++ b/lld/test/ELF/version-script-warn.s @@ -0,0 +1,35 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o + +# RUN: echo 'foo { *; }; bar { *; };' > %t.ver +# RUN: ld.lld --version-script %t.ver %t.o -shared -o %t.so 2>&1 | \ +# RUN: FileCheck --check-prefix=MULTVER %s + +# RUN: echo '{ global: *; local: *;};' > %t.ver +# RUN: ld.lld --version-script %t.ver %t.o -shared -o %t.so 2>&1 | \ +# RUN: FileCheck --check-prefix=LOCGLOB %s + +# RUN: echo 'V1 { global: *; }; V2 { local: *;};' > %t.ver +# RUN: ld.lld --version-script %t.ver %t.o -shared -o %t.so 2>&1 | \ +# RUN: FileCheck --check-prefix=LOCGLOB %s + +# RUN: echo 'V1 { local: *; }; V2 { global: *;};' > %t.ver +# RUN: ld.lld --version-script %t.ver %t.o -shared -o %t.so 2>&1 | \ +# RUN: FileCheck --check-prefix=LOCGLOB %s + +# RUN: echo 'V1 { local: *; }; V2 { local: *;};' > %t.ver +# RUN: ld.lld --version-script %t.ver %t.o -shared -o %t.so --fatal-warnings + +## --retain-symbols-file uses the same internal infrastructure as the support +## for version scripts. Do not show the warnings if they both are used.
+# RUN: echo 'foo' > %t_retain.txt +# RUN: echo '{ local: *; };' > %t_local.ver +# RUN: echo '{ global: *; };' > %t_global.ver +# RUN: ld.lld --retain-symbols-file=%t_retain.txt --version-script %t_local.ver %t.o -shared -o %t.so --fatal-warnings +# RUN: ld.lld --retain-symbols-file=%t_retain.txt --version-script %t_global.ver %t.o -shared -o %t.so --fatal-warnings + +# MULTVER: warning: wildcard pattern '*' is used for multiple version definitions in version script +# LOCGLOB: warning: wildcard pattern '*' is used for both 'local' and 'global' scopes in version script + +.globl foo +foo: From 0add1741d58e4b8d6cbc5f50e1fac86296680e5b Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Thu, 10 Oct 2024 16:55:41 -0700 Subject: [PATCH 103/177] [bazel] Port e9c8f75d45ababe7f805078bbf7bda2e7425f1b7 (#111928) --- .../bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index 96202bf47b8486..38493411addebf 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -2248,6 +2248,8 @@ cc_library( hdrs = glob(["Process/minidump/*.h"]), include_prefix = "Plugins", deps = [ + ":PluginDynamicLoaderPosixDYLD", + ":PluginDynamicLoaderPosixDYLDHeaders", ":PluginObjectFilePlaceholder", ":PluginProcessElfCore", ":PluginProcessUtility", From 774c953cf8f8ff2fe45b07f388a687748b775878 Mon Sep 17 00:00:00 2001 From: yronglin Date: Fri, 11 Oct 2024 08:15:27 +0800 Subject: [PATCH 104/177] [NFC][clang] Fix typo in ReleaseNotes (#111930) Fix a typo in ReleaseNotes that introduced by https://github.com/llvm/llvm-project/pull/86960. 
Signed-off-by: yronglin --- clang/docs/ReleaseNotes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e48835d4738007..df165b91252505 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -172,7 +172,7 @@ C++23 Feature Support - Removed the restriction to literal types in constexpr functions in C++23 mode. - Extend lifetime of temporaries in mem-default-init for P2718R0. Clang now fully - supported `P2718R0 Lifetime extension in range-based for loops `_. + supports `P2718R0 Lifetime extension in range-based for loops `_. C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ From 9c81a2476566b068ef54fd51ab2540933542b2a6 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 17:44:57 -0700 Subject: [PATCH 105/177] [asan] Prevent printing invalid parent thread (#111916) By default reuse can happend only after `UINT32_MAX` threads, so it's almost NFC. --- compiler-rt/lib/asan/asan_descriptions.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/asan/asan_descriptions.cpp b/compiler-rt/lib/asan/asan_descriptions.cpp index 1c2f20a76343bb..674fe9c1e90be0 100644 --- a/compiler-rt/lib/asan/asan_descriptions.cpp +++ b/compiler-rt/lib/asan/asan_descriptions.cpp @@ -48,9 +48,20 @@ void DescribeThread(AsanThreadContext *context) { return; } context->announced = true; + + AsanThreadContext *parent_context = + context->parent_tid == kInvalidTid + ? nullptr + : GetThreadContextByTidLocked(context->parent_tid); + + // `context->parent_tid` may point to reused slot. Check `unique_id` which + // is always smaller for the parent, always greater for a new user. 
+ if (context->unique_id <= parent_context->unique_id) + parent_context = nullptr; + InternalScopedString str; str.AppendF("Thread %s", AsanThreadIdAndName(context).c_str()); - if (context->parent_tid == kInvalidTid) { + if (!parent_context) { str.Append(" created by unknown thread\n"); Printf("%s", str.data()); return; @@ -60,11 +71,8 @@ void DescribeThread(AsanThreadContext *context) { Printf("%s", str.data()); StackDepotGet(context->stack_id).Print(); // Recursively described parent thread if needed. - if (flags()->print_full_thread_history) { - AsanThreadContext *parent_context = - GetThreadContextByTidLocked(context->parent_tid); + if (flags()->print_full_thread_history) DescribeThread(parent_context); - } } // Shadow descriptions From 72fb37922577997f3666203dbdb2601f0fc97748 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Fri, 11 Oct 2024 08:45:14 +0800 Subject: [PATCH 106/177] AArch64: Select FCANONICALIZE (#104429) FMINNM/FMAXNM instructions of AArch64 follow IEEE754-2008. We can use them to canonicalize a floating point number. And FMINNUM_IEEE/FMAXNUM_IEEE is used by something like expanding FMINIMUMNUM/FMAXIMUMNUM, so let's define them. 
--------- Co-authored-by: Your Name --- .../Target/AArch64/AArch64ISelLowering.cpp | 3 + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 53 +- llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll | 587 ++++++++++++++++++ .../AArch64/fp-maximumnum-minimumnum.ll | 560 +++++++++++++++++ 4 files changed, 1188 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c1aefee3793c96..8a217cd1ec5cf9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -775,6 +775,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM, + ISD::FCANONICALIZE, ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, @@ -818,6 +819,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32); setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32); setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32); + setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32); setOperationAction(ISD::FABS, V4Narrow, Legal); setOperationAction(ISD::FNEG, V4Narrow, Legal); @@ -851,6 +853,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, V8Narrow, Expand); setOperationAction(ISD::SELECT_CC, V8Narrow, Expand); setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand); + setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32); }; if (!Subtarget->hasFullFP16()) { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 33d05d6039b096..325508b62a9f14 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5052,17 +5052,25 @@ def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), 
(v1f64 FPR64:$Rm))), def : Pat<(fminnum_ieee (f64 FPR64:$a), (f64 FPR64:$b)), (FMINNMDrr FPR64:$a, FPR64:$b)>; -def : Pat<(fminnum_ieee (f32 FPR32:$a), (f32 FPR32:$b)), - (FMINNMSrr FPR32:$a, FPR32:$b)>; -def : Pat<(fminnum_ieee (f16 FPR16:$a), (f16 FPR16:$b)), - (FMINNMHrr FPR16:$a, FPR16:$b)>; def : Pat<(fmaxnum_ieee (f64 FPR64:$a), (f64 FPR64:$b)), (FMAXNMDrr FPR64:$a, FPR64:$b)>; +def : Pat<(f64 (fcanonicalize f64:$a)), + (FMINNMDrr f64:$a, f64:$a)>; +def : Pat<(fminnum_ieee (f32 FPR32:$a), (f32 FPR32:$b)), + (FMINNMSrr FPR32:$a, FPR32:$b)>; def : Pat<(fmaxnum_ieee (f32 FPR32:$a), (f32 FPR32:$b)), (FMAXNMSrr FPR32:$a, FPR32:$b)>; +def : Pat<(f32 (fcanonicalize f32:$a)), + (FMINNMSrr f32:$a, f32:$a)>; + +let Predicates = [HasFullFP16] in { +def : Pat<(fminnum_ieee (f16 FPR16:$a), (f16 FPR16:$b)), + (FMINNMHrr FPR16:$a, FPR16:$b)>; def : Pat<(fmaxnum_ieee (f16 FPR16:$a), (f16 FPR16:$b)), (FMAXNMHrr FPR16:$a, FPR16:$b)>; - +def : Pat<(f16 (fcanonicalize f16:$a)), + (FMINNMHrr f16:$a, f16:$a)>; +} //===----------------------------------------------------------------------===// // Floating point three operand instructions. 
//===----------------------------------------------------------------------===// @@ -5567,26 +5575,41 @@ defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", any_fminnum>; defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", any_fminimum>; +let Predicates = [HasNEON] in { def : Pat<(v2f64 (fminnum_ieee (v2f64 V128:$Rn), (v2f64 V128:$Rm))), (v2f64 (FMINNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rm)))>; -def : Pat<(v4f32 (fminnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))), - (v4f32 (FMINNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rm)))>; -def : Pat<(v8f16 (fminnum_ieee (v8f16 V128:$Rn), (v8f16 V128:$Rm))), - (v8f16 (FMINNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm)))>; -def : Pat<(v2f32 (fminnum_ieee (v2f32 V64:$Rn), (v2f32 V64:$Rm))), - (v2f32 (FMINNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm)))>; -def : Pat<(v4f16 (fminnum_ieee (v4f16 V64:$Rn), (v4f16 V64:$Rm))), - (v4f16 (FMINNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm)))>; def : Pat<(v2f64 (fmaxnum_ieee (v2f64 V128:$Rn), (v2f64 V128:$Rm))), (v2f64 (FMAXNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rm)))>; +def : Pat<(v2f64 (fcanonicalize (v2f64 V128:$Rn))), + (v2f64 (FMINNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rn)))>; +def : Pat<(v4f32 (fminnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))), + (v4f32 (FMINNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rm)))>; def : Pat<(v4f32 (fmaxnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))), (v4f32 (FMAXNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rm)))>; -def : Pat<(v8f16 (fmaxnum_ieee (v8f16 V128:$Rn), (v8f16 V128:$Rm))), - (v8f16 (FMAXNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm)))>; +def : Pat<(v4f32 (fcanonicalize (v4f32 V128:$Rn))), + (v4f32 (FMINNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rn)))>; +def : Pat<(v2f32 (fminnum_ieee (v2f32 V64:$Rn), (v2f32 V64:$Rm))), + (v2f32 (FMINNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm)))>; def : Pat<(v2f32 (fmaxnum_ieee (v2f32 V64:$Rn), (v2f32 V64:$Rm))), (v2f32 (FMAXNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm)))>; +def 
: Pat<(v2f32 (fcanonicalize (v2f32 V64:$Rn))), + (v2f32 (FMINNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rn)))>; +} + +let Predicates = [HasNEON, HasFullFP16] in { +def : Pat<(v8f16 (fminnum_ieee (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (v8f16 (FMINNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm)))>; +def : Pat<(v8f16 (fmaxnum_ieee (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (v8f16 (FMAXNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm)))>; +def : Pat<(v8f16 (fcanonicalize (v8f16 V128:$Rn))), + (v8f16 (FMINNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rn)))>; +def : Pat<(v4f16 (fminnum_ieee (v4f16 V64:$Rn), (v4f16 V64:$Rm))), + (v4f16 (FMINNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm)))>; def : Pat<(v4f16 (fmaxnum_ieee (v4f16 V64:$Rn), (v4f16 V64:$Rm))), (v4f16 (FMAXNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm)))>; +def : Pat<(v4f16 (fcanonicalize (v4f16 V64:$Rn))), + (v4f16 (FMINNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rn)))>; +} // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. 
diff --git a/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll b/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll new file mode 100644 index 00000000000000..753e2b73433994 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fp-fcanonicalize.ll @@ -0,0 +1,587 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=aarch64 --mattr=-fullfp16,-neon < %s | FileCheck %s --check-prefix=CHECK-NOFP16-NONEON +; RUN: llc --mtriple=aarch64 --mattr=+fullfp16,-neon < %s | FileCheck %s --check-prefix=CHECK-FP16-NONEON +; RUN: llc --mtriple=aarch64 --mattr=-fullfp16,+neon < %s | FileCheck %s --check-prefix=CHECK-NOFP16-NEON +; RUN: llc --mtriple=aarch64 --mattr=+fullfp16,+neon < %s | FileCheck %s --check-prefixes=CHECK-FP16-NEON + +declare half @llvm.fcanonicalize.f16(half) +declare float @llvm.fcanonicalize.f32(float) +declare double @llvm.fcanonicalize.f64(double) + +define half @fcanonicalize_f16(half %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_f16: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_f16: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_f16: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_f16: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NEON-NEXT: ret + %z = call half @llvm.canonicalize.f16(half %x) + ret half %z +} + +define half @fcanonicalize_f16_nnan(half %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_f16_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; 
CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_f16_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_f16_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_f16_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan half @llvm.canonicalize.f16(half %x) + ret half %z +} + +define <2 x half> @fcanonicalize_v2f16(<2 x half> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v2f16: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fcvt s1, h1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h1, s1 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v2f16: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: fminnm h1, h1, h1 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v2f16: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NOFP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v2f16: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.4h, v0.4h, v0.4h +; CHECK-FP16-NEON-NEXT: ret + %z = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %x) + ret <2 x half> %z +} + +define <2 x half> @fcanonicalize_v2f16_nnan(<2 x half> %x) { +; CHECK-NOFP16-NONEON-LABEL: 
fcanonicalize_v2f16_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fcvt s1, h1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h1, s1 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v2f16_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: fminnm h1, h1, h1 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v2f16_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NOFP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v2f16_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.4h, v0.4h, v0.4h +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan <2 x half> @llvm.canonicalize.v2f16(<2 x half> %x) + ret <2 x half> %z +} + +define <4 x half> @fcanonicalize_v4f16(<4 x half> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v4f16: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fcvt s1, h1 +; CHECK-NOFP16-NONEON-NEXT: fcvt s2, h2 +; CHECK-NOFP16-NONEON-NEXT: fcvt s3, h3 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-NOFP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h1, s1 +; CHECK-NOFP16-NONEON-NEXT: fcvt h2, s2 +; CHECK-NOFP16-NONEON-NEXT: fcvt h3, s3 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v4f16: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: fminnm h1, h1, h1 +; CHECK-FP16-NONEON-NEXT: fminnm h2, h2, h2 +; 
CHECK-FP16-NONEON-NEXT: fminnm h3, h3, h3 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v4f16: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NOFP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v4f16: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.4h, v0.4h, v0.4h +; CHECK-FP16-NEON-NEXT: ret + %z = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %x) + ret <4 x half> %z +} + +define <4 x half> @fcanonicalize_v4f16_nnan(<4 x half> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v4f16_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fcvt s1, h1 +; CHECK-NOFP16-NONEON-NEXT: fcvt s2, h2 +; CHECK-NOFP16-NONEON-NEXT: fcvt s3, h3 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-NOFP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h1, s1 +; CHECK-NOFP16-NONEON-NEXT: fcvt h2, s2 +; CHECK-NOFP16-NONEON-NEXT: fcvt h3, s3 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v4f16_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: fminnm h1, h1, h1 +; CHECK-FP16-NONEON-NEXT: fminnm h2, h2, h2 +; CHECK-FP16-NONEON-NEXT: fminnm h3, h3, h3 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v4f16_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NOFP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v4f16_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.4h, v0.4h, v0.4h +; 
CHECK-FP16-NEON-NEXT: ret + %z = call nnan <4 x half> @llvm.canonicalize.v4f16(<4 x half> %x) + ret <4 x half> %z +} + +define <8 x half> @fcanonicalize_v8f16(<8 x half> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v8f16: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fcvt s1, h1 +; CHECK-NOFP16-NONEON-NEXT: fcvt s2, h2 +; CHECK-NOFP16-NONEON-NEXT: fcvt s3, h3 +; CHECK-NOFP16-NONEON-NEXT: fcvt s4, h4 +; CHECK-NOFP16-NONEON-NEXT: fcvt s5, h5 +; CHECK-NOFP16-NONEON-NEXT: fcvt s6, h6 +; CHECK-NOFP16-NONEON-NEXT: fcvt s7, h7 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-NOFP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-NOFP16-NONEON-NEXT: fminnm s4, s4, s4 +; CHECK-NOFP16-NONEON-NEXT: fminnm s5, s5, s5 +; CHECK-NOFP16-NONEON-NEXT: fminnm s6, s6, s6 +; CHECK-NOFP16-NONEON-NEXT: fminnm s7, s7, s7 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h1, s1 +; CHECK-NOFP16-NONEON-NEXT: fcvt h2, s2 +; CHECK-NOFP16-NONEON-NEXT: fcvt h3, s3 +; CHECK-NOFP16-NONEON-NEXT: fcvt h4, s4 +; CHECK-NOFP16-NONEON-NEXT: fcvt h5, s5 +; CHECK-NOFP16-NONEON-NEXT: fcvt h6, s6 +; CHECK-NOFP16-NONEON-NEXT: fcvt h7, s7 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v8f16: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: fminnm h1, h1, h1 +; CHECK-FP16-NONEON-NEXT: fminnm h2, h2, h2 +; CHECK-FP16-NONEON-NEXT: fminnm h3, h3, h3 +; CHECK-FP16-NONEON-NEXT: fminnm h4, h4, h4 +; CHECK-FP16-NONEON-NEXT: fminnm h5, h5, h5 +; CHECK-FP16-NONEON-NEXT: fminnm h6, h6, h6 +; CHECK-FP16-NONEON-NEXT: fminnm h7, h7, h7 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v8f16: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvtl v1.4s, v0.4h +; CHECK-NOFP16-NEON-NEXT: fcvtl2 v2.4s, v0.8h +; 
CHECK-NOFP16-NEON-NEXT: fminnm v1.4s, v1.4s, v1.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-NEON-NEXT: fminnm v1.4s, v2.4s, v2.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v8f16: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.8h, v0.8h, v0.8h +; CHECK-FP16-NEON-NEXT: ret + %z = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %x) + ret <8 x half> %z +} + +define <8 x half> @fcanonicalize_v8f16_nnan(<8 x half> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v8f16_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fcvt s0, h0 +; CHECK-NOFP16-NONEON-NEXT: fcvt s1, h1 +; CHECK-NOFP16-NONEON-NEXT: fcvt s2, h2 +; CHECK-NOFP16-NONEON-NEXT: fcvt s3, h3 +; CHECK-NOFP16-NONEON-NEXT: fcvt s4, h4 +; CHECK-NOFP16-NONEON-NEXT: fcvt s5, h5 +; CHECK-NOFP16-NONEON-NEXT: fcvt s6, h6 +; CHECK-NOFP16-NONEON-NEXT: fcvt s7, h7 +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-NOFP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-NOFP16-NONEON-NEXT: fminnm s4, s4, s4 +; CHECK-NOFP16-NONEON-NEXT: fminnm s5, s5, s5 +; CHECK-NOFP16-NONEON-NEXT: fminnm s6, s6, s6 +; CHECK-NOFP16-NONEON-NEXT: fminnm s7, s7, s7 +; CHECK-NOFP16-NONEON-NEXT: fcvt h0, s0 +; CHECK-NOFP16-NONEON-NEXT: fcvt h1, s1 +; CHECK-NOFP16-NONEON-NEXT: fcvt h2, s2 +; CHECK-NOFP16-NONEON-NEXT: fcvt h3, s3 +; CHECK-NOFP16-NONEON-NEXT: fcvt h4, s4 +; CHECK-NOFP16-NONEON-NEXT: fcvt h5, s5 +; CHECK-NOFP16-NONEON-NEXT: fcvt h6, s6 +; CHECK-NOFP16-NONEON-NEXT: fcvt h7, s7 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v8f16_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm h0, h0, h0 +; CHECK-FP16-NONEON-NEXT: fminnm h1, h1, h1 +; CHECK-FP16-NONEON-NEXT: fminnm h2, h2, h2 +; CHECK-FP16-NONEON-NEXT: fminnm h3, h3, h3 +; CHECK-FP16-NONEON-NEXT: fminnm 
h4, h4, h4 +; CHECK-FP16-NONEON-NEXT: fminnm h5, h5, h5 +; CHECK-FP16-NONEON-NEXT: fminnm h6, h6, h6 +; CHECK-FP16-NONEON-NEXT: fminnm h7, h7, h7 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v8f16_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fcvtl v1.4s, v0.4h +; CHECK-NOFP16-NEON-NEXT: fcvtl2 v2.4s, v0.8h +; CHECK-NOFP16-NEON-NEXT: fminnm v1.4s, v1.4s, v1.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-NEON-NEXT: fminnm v1.4s, v2.4s, v2.4s +; CHECK-NOFP16-NEON-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v8f16_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.8h, v0.8h, v0.8h +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan <8 x half> @llvm.canonicalize.v8f16(<8 x half> %x) + ret <8 x half> %z +} + +define float @fcanonicalize_f32(float %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_f32: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_f32: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_f32: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_f32: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NEON-NEXT: ret + %z = call float @llvm.canonicalize.f32(float %x) + ret float %z +} + +define float @fcanonicalize_f32_nnan(float %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_f32_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_f32_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NONEON-NEXT: ret +; +; 
CHECK-NOFP16-NEON-LABEL: fcanonicalize_f32_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_f32_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan float @llvm.canonicalize.f32(float %x) + ret float %z +} + +define <2 x float> @fcanonicalize_v2f32(<2 x float> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v2f32: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v2f32: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v2f32: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm v0.2s, v0.2s, v0.2s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v2f32: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.2s, v0.2s, v0.2s +; CHECK-FP16-NEON-NEXT: ret + %z = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %x) + ret <2 x float> %z +} + +define <2 x float> @fcanonicalize_v2f32_nnan(<2 x float> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v2f32_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v2f32_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v2f32_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm v0.2s, v0.2s, v0.2s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: 
fcanonicalize_v2f32_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.2s, v0.2s, v0.2s +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan <2 x float> @llvm.canonicalize.v2f32(<2 x float> %x) + ret <2 x float> %z +} + +define <4 x float> @fcanonicalize_v4f32(<4 x float> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v4f32: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-NOFP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v4f32: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-FP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-FP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v4f32: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v4f32: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-FP16-NEON-NEXT: ret + %z = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %x) + ret <4 x float> %z +} + +define <4 x float> @fcanonicalize_v4f32_nnan(<4 x float> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v4f32_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-NOFP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-NOFP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-NOFP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v4f32_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm s0, s0, s0 +; CHECK-FP16-NONEON-NEXT: fminnm s1, s1, s1 +; CHECK-FP16-NONEON-NEXT: fminnm s2, s2, s2 +; CHECK-FP16-NONEON-NEXT: fminnm s3, s3, s3 +; CHECK-FP16-NONEON-NEXT: ret +; +; 
CHECK-NOFP16-NEON-LABEL: fcanonicalize_v4f32_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v4f32_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.4s, v0.4s, v0.4s +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan <4 x float> @llvm.canonicalize.v4f32(<4 x float> %x) + ret <4 x float> %z +} + +define double @fcanonicalize_f64(double %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_f64: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_f64: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_f64: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm d0, d0, d0 +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_f64: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm d0, d0, d0 +; CHECK-FP16-NEON-NEXT: ret + %z = call double @llvm.canonicalize.f64(double %x) + ret double %z +} + +define double @fcanonicalize_f64_nnan(double %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_f64_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_f64_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_f64_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm d0, d0, d0 +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_f64_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm d0, d0, d0 +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan double @llvm.canonicalize.f64(double %x) + ret double %z +} + +define <2 x double> 
@fcanonicalize_v2f64(<2 x double> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v2f64: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-NOFP16-NONEON-NEXT: fminnm d1, d1, d1 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v2f64: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-FP16-NONEON-NEXT: fminnm d1, d1, d1 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v2f64: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm v0.2d, v0.2d, v0.2d +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v2f64: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.2d, v0.2d, v0.2d +; CHECK-FP16-NEON-NEXT: ret + %z = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %x) + ret <2 x double> %z +} + +define <2 x double> @fcanonicalize_v2f64_nnan(<2 x double> %x) { +; CHECK-NOFP16-NONEON-LABEL: fcanonicalize_v2f64_nnan: +; CHECK-NOFP16-NONEON: // %bb.0: +; CHECK-NOFP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-NOFP16-NONEON-NEXT: fminnm d1, d1, d1 +; CHECK-NOFP16-NONEON-NEXT: ret +; +; CHECK-FP16-NONEON-LABEL: fcanonicalize_v2f64_nnan: +; CHECK-FP16-NONEON: // %bb.0: +; CHECK-FP16-NONEON-NEXT: fminnm d0, d0, d0 +; CHECK-FP16-NONEON-NEXT: fminnm d1, d1, d1 +; CHECK-FP16-NONEON-NEXT: ret +; +; CHECK-NOFP16-NEON-LABEL: fcanonicalize_v2f64_nnan: +; CHECK-NOFP16-NEON: // %bb.0: +; CHECK-NOFP16-NEON-NEXT: fminnm v0.2d, v0.2d, v0.2d +; CHECK-NOFP16-NEON-NEXT: ret +; +; CHECK-FP16-NEON-LABEL: fcanonicalize_v2f64_nnan: +; CHECK-FP16-NEON: // %bb.0: +; CHECK-FP16-NEON-NEXT: fminnm v0.2d, v0.2d, v0.2d +; CHECK-FP16-NEON-NEXT: ret + %z = call nnan <2 x double> @llvm.canonicalize.v2f64(<2 x double> %x) + ret <2 x double> %z +} diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll index b8406179f3cb32..bb3f9a3e52a16b 100644 --- 
a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll +++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll @@ -472,3 +472,563 @@ entry: %c = call nnan <16 x half> @llvm.minimumnum.v16f16(<16 x half> %a, <16 x half> %b) ret <16 x half> %c } + +;;;;;;;;;;;;;;;; max_f64 +define double @max_f64(double %a, double %b) { +; AARCH64-LABEL: max_f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm d1, d1, d1 +; AARCH64-NEXT: fminnm d0, d0, d0 +; AARCH64-NEXT: fmaxnm d0, d0, d1 +; AARCH64-NEXT: ret +entry: + %c = call double @llvm.maximumnum.f64(double %a, double %b) + ret double %c +} + +define <2 x double> @max_v2f64(<2 x double> %a, <2 x double> %b) { +; AARCH64-LABEL: max_v2f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.2d, v1.2d, v1.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v0.2d +; AARCH64-NEXT: fmaxnm v0.2d, v0.2d, v1.2d +; AARCH64-NEXT: ret +entry: + %c = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %c +} + +define <3 x double> @max_v3f64(<3 x double> %a, <3 x double> %b) { +; AARCH64-LABEL: max_v3f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: // kill: def $d3 killed $d3 def $q3 +; AARCH64-NEXT: // kill: def $d0 killed $d0 def $q0 +; AARCH64-NEXT: // kill: def $d4 killed $d4 def $q4 +; AARCH64-NEXT: // kill: def $d1 killed $d1 def $q1 +; AARCH64-NEXT: // kill: def $d2 killed $d2 def $q2 +; AARCH64-NEXT: // kill: def $d5 killed $d5 def $q5 +; AARCH64-NEXT: mov v0.d[1], v1.d[0] +; AARCH64-NEXT: mov v3.d[1], v4.d[0] +; AARCH64-NEXT: fminnm v2.2d, v2.2d, v2.2d +; AARCH64-NEXT: fminnm v1.2d, v3.2d, v3.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v0.2d +; AARCH64-NEXT: fmaxnm v0.2d, v0.2d, v1.2d +; AARCH64-NEXT: fminnm v1.2d, v5.2d, v5.2d +; AARCH64-NEXT: fmaxnm v2.2d, v2.2d, v1.2d +; AARCH64-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; AARCH64-NEXT: // kill: def $d0 killed $d0 killed $q0 +; AARCH64-NEXT: // kill: def $d1 killed $d1 killed $q1 +; AARCH64-NEXT: // kill: def $d2 killed $d2 killed 
$q2 +; AARCH64-NEXT: ret +entry: + %c = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> %a, <3 x double> %b) + ret <3 x double> %c +} + +define <4 x double> @max_v4f64(<4 x double> %a, <4 x double> %b) { +; AARCH64-LABEL: max_v4f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v2.2d, v2.2d, v2.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v0.2d +; AARCH64-NEXT: fminnm v3.2d, v3.2d, v3.2d +; AARCH64-NEXT: fminnm v1.2d, v1.2d, v1.2d +; AARCH64-NEXT: fmaxnm v0.2d, v0.2d, v2.2d +; AARCH64-NEXT: fmaxnm v1.2d, v1.2d, v3.2d +; AARCH64-NEXT: ret +entry: + %c = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> %a, <4 x double> %b) + ret <4 x double> %c +} + +;;;;;;;;;;;;;;;;;; max_f32 +define float @max_f32(float %a, float %b) { +; AARCH64-LABEL: max_f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm s1, s1, s1 +; AARCH64-NEXT: fminnm s0, s0, s0 +; AARCH64-NEXT: fmaxnm s0, s0, s1 +; AARCH64-NEXT: ret +entry: + %c = call float @llvm.maximumnum.f32(float %a, float %b) + ret float %c +} + +define <2 x float> @max_v2f32(<2 x float> %a, <2 x float> %b) { +; AARCH64-LABEL: max_v2f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.2s, v1.2s, v1.2s +; AARCH64-NEXT: fminnm v0.2s, v0.2s, v0.2s +; AARCH64-NEXT: fmaxnm v0.2s, v0.2s, v1.2s +; AARCH64-NEXT: ret +entry: + %c = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> %a, <2 x float> %b) + ret <2 x float> %c +} + +define <3 x float> @max_v3f32(<3 x float> %a, <3 x float> %b) { +; AARCH64-LABEL: max_v3f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v1.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; AARCH64-NEXT: ret +entry: + %c = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> %a, <3 x float> %b) + ret <3 x float> %c +} + +define <4 x float> @max_v4f32(<4 x float> %a, <4 x float> %b) { +; AARCH64-LABEL: max_v4f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v1.4s +; AARCH64-NEXT: fminnm 
v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; AARCH64-NEXT: ret +entry: + %c = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %c +} + +define <5 x float> @max_v5f32(<5 x float> %a, <5 x float> %b) { +; AARCH64-LABEL: max_v5f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: // kill: def $s0 killed $s0 def $q0 +; AARCH64-NEXT: // kill: def $s5 killed $s5 def $q5 +; AARCH64-NEXT: // kill: def $s1 killed $s1 def $q1 +; AARCH64-NEXT: // kill: def $s6 killed $s6 def $q6 +; AARCH64-NEXT: // kill: def $s2 killed $s2 def $q2 +; AARCH64-NEXT: // kill: def $s7 killed $s7 def $q7 +; AARCH64-NEXT: // kill: def $s3 killed $s3 def $q3 +; AARCH64-NEXT: mov x8, sp +; AARCH64-NEXT: // kill: def $s4 killed $s4 def $q4 +; AARCH64-NEXT: mov v0.s[1], v1.s[0] +; AARCH64-NEXT: mov v5.s[1], v6.s[0] +; AARCH64-NEXT: mov v0.s[2], v2.s[0] +; AARCH64-NEXT: mov v5.s[2], v7.s[0] +; AARCH64-NEXT: ldr s2, [sp, #8] +; AARCH64-NEXT: fminnm v2.4s, v2.4s, v2.4s +; AARCH64-NEXT: mov v0.s[3], v3.s[0] +; AARCH64-NEXT: ld1 { v5.s }[3], [x8] +; AARCH64-NEXT: fminnm v3.4s, v4.4s, v4.4s +; AARCH64-NEXT: fminnm v1.4s, v5.4s, v5.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fmaxnm v4.4s, v3.4s, v2.4s +; AARCH64-NEXT: // kill: def $s4 killed $s4 killed $q4 +; AARCH64-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; AARCH64-NEXT: mov s1, v0.s[1] +; AARCH64-NEXT: mov s2, v0.s[2] +; AARCH64-NEXT: mov s3, v0.s[3] +; AARCH64-NEXT: // kill: def $s0 killed $s0 killed $q0 +; AARCH64-NEXT: ret +entry: + %c = call <5 x float> @llvm.maximumnum.v5f32(<5 x float> %a, <5 x float> %b) + ret <5 x float> %c +} + +define <8 x float> @max_v8f32(<8 x float> %a, <8 x float> %b) { +; AARCH64-LABEL: max_v8f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v2.4s, v2.4s, v2.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fminnm v3.4s, v3.4s, v3.4s +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v1.4s +; AARCH64-NEXT: fmaxnm v0.4s, v0.4s, v2.4s +; 
AARCH64-NEXT: fmaxnm v1.4s, v1.4s, v3.4s +; AARCH64-NEXT: ret +entry: + %c = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> %a, <8 x float> %b) + ret <8 x float> %c +} + +;;;;;;;;;;;;;;;;;; max_f16 +define half @max_f16(half %a, half %b) { +; AARCH64-LABEL: max_f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm h1, h1, h1 +; AARCH64-NEXT: fminnm h0, h0, h0 +; AARCH64-NEXT: fmaxnm h0, h0, h1 +; AARCH64-NEXT: ret +entry: + %c = call half @llvm.maximumnum.f16(half %a, half %b) + ret half %c +} + +define <2 x half> @max_v2f16(<2 x half> %a, <2 x half> %b) { +; AARCH64-LABEL: max_v2f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4h, v1.4h, v1.4h +; AARCH64-NEXT: fminnm v0.4h, v0.4h, v0.4h +; AARCH64-NEXT: fmaxnm v0.4h, v0.4h, v1.4h +; AARCH64-NEXT: ret +entry: + %c = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %c +} + +define <4 x half> @max_v4f16(<4 x half> %a, <4 x half> %b) { +; AARCH64-LABEL: max_v4f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4h, v1.4h, v1.4h +; AARCH64-NEXT: fminnm v0.4h, v0.4h, v0.4h +; AARCH64-NEXT: fmaxnm v0.4h, v0.4h, v1.4h +; AARCH64-NEXT: ret +entry: + %c = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %c +} + +define <8 x half> @max_v8f16(<8 x half> %a, <8 x half> %b) { +; AARCH64-LABEL: max_v8f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h +; AARCH64-NEXT: fmaxnm v0.8h, v0.8h, v1.8h +; AARCH64-NEXT: ret +entry: + %c = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> %a, <8 x half> %b) + ret <8 x half> %c +} + +define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { +; AARCH64-LABEL: max_v9f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: // kill: def $h0 killed $h0 def $q0 +; AARCH64-NEXT: // kill: def $h1 killed $h1 def $q1 +; AARCH64-NEXT: // kill: def $h2 killed $h2 def $q2 +; AARCH64-NEXT: add x9, sp, #16 +; AARCH64-NEXT: 
// kill: def $h3 killed $h3 def $q3 +; AARCH64-NEXT: // kill: def $h4 killed $h4 def $q4 +; AARCH64-NEXT: // kill: def $h5 killed $h5 def $q5 +; AARCH64-NEXT: // kill: def $h6 killed $h6 def $q6 +; AARCH64-NEXT: // kill: def $h7 killed $h7 def $q7 +; AARCH64-NEXT: mov v0.h[1], v1.h[0] +; AARCH64-NEXT: ldr h1, [sp, #8] +; AARCH64-NEXT: ld1 { v1.h }[1], [x9] +; AARCH64-NEXT: add x9, sp, #24 +; AARCH64-NEXT: mov v0.h[2], v2.h[0] +; AARCH64-NEXT: ldr h2, [sp] +; AARCH64-NEXT: ld1 { v1.h }[2], [x9] +; AARCH64-NEXT: add x9, sp, #32 +; AARCH64-NEXT: fminnm v2.8h, v2.8h, v2.8h +; AARCH64-NEXT: mov v0.h[3], v3.h[0] +; AARCH64-NEXT: ld1 { v1.h }[3], [x9] +; AARCH64-NEXT: add x9, sp, #40 +; AARCH64-NEXT: ldr h3, [sp, #72] +; AARCH64-NEXT: ld1 { v1.h }[4], [x9] +; AARCH64-NEXT: add x9, sp, #48 +; AARCH64-NEXT: fminnm v3.8h, v3.8h, v3.8h +; AARCH64-NEXT: mov v0.h[4], v4.h[0] +; AARCH64-NEXT: ld1 { v1.h }[5], [x9] +; AARCH64-NEXT: add x9, sp, #56 +; AARCH64-NEXT: fmaxnm v2.8h, v2.8h, v3.8h +; AARCH64-NEXT: mov v0.h[5], v5.h[0] +; AARCH64-NEXT: ld1 { v1.h }[6], [x9] +; AARCH64-NEXT: add x9, sp, #64 +; AARCH64-NEXT: str h2, [x8, #16] +; AARCH64-NEXT: mov v0.h[6], v6.h[0] +; AARCH64-NEXT: ld1 { v1.h }[7], [x9] +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h +; AARCH64-NEXT: mov v0.h[7], v7.h[0] +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h +; AARCH64-NEXT: fmaxnm v0.8h, v0.8h, v1.8h +; AARCH64-NEXT: str q0, [x8] +; AARCH64-NEXT: ret +entry: + %c = call <9 x half> @llvm.maximumnum.v9f16(<9 x half> %a, <9 x half> %b) + ret <9 x half> %c +} + +define <16 x half> @max_v16f16(<16 x half> %a, <16 x half> %b) { +; AARCH64-LABEL: max_v16f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v2.8h, v2.8h, v2.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h +; AARCH64-NEXT: fminnm v3.8h, v3.8h, v3.8h +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h +; AARCH64-NEXT: fmaxnm v0.8h, v0.8h, v2.8h +; AARCH64-NEXT: fmaxnm v1.8h, v1.8h, v3.8h +; AARCH64-NEXT: ret +entry: + %c = call <16 x half> 
@llvm.maximumnum.v16f16(<16 x half> %a, <16 x half> %b) + ret <16 x half> %c +} + +;;;;;;;;;;;;;;;; min_f64 +define double @min_f64(double %a, double %b) { +; AARCH64-LABEL: min_f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm d1, d1, d1 +; AARCH64-NEXT: fminnm d0, d0, d0 +; AARCH64-NEXT: fminnm d0, d0, d1 +; AARCH64-NEXT: ret +entry: + %c = call double @llvm.minimumnum.f64(double %a, double %b) + ret double %c +} + +define <2 x double> @min_v2f64(<2 x double> %a, <2 x double> %b) { +; AARCH64-LABEL: min_v2f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.2d, v1.2d, v1.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v0.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v1.2d +; AARCH64-NEXT: ret +entry: + %c = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %a, <2 x double> %b) + ret <2 x double> %c +} + +define <3 x double> @min_v3f64(<3 x double> %a, <3 x double> %b) { +; AARCH64-LABEL: min_v3f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: // kill: def $d3 killed $d3 def $q3 +; AARCH64-NEXT: // kill: def $d0 killed $d0 def $q0 +; AARCH64-NEXT: // kill: def $d4 killed $d4 def $q4 +; AARCH64-NEXT: // kill: def $d1 killed $d1 def $q1 +; AARCH64-NEXT: // kill: def $d2 killed $d2 def $q2 +; AARCH64-NEXT: // kill: def $d5 killed $d5 def $q5 +; AARCH64-NEXT: mov v0.d[1], v1.d[0] +; AARCH64-NEXT: mov v3.d[1], v4.d[0] +; AARCH64-NEXT: fminnm v2.2d, v2.2d, v2.2d +; AARCH64-NEXT: fminnm v1.2d, v3.2d, v3.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v0.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v1.2d +; AARCH64-NEXT: fminnm v1.2d, v5.2d, v5.2d +; AARCH64-NEXT: fminnm v2.2d, v2.2d, v1.2d +; AARCH64-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; AARCH64-NEXT: // kill: def $d0 killed $d0 killed $q0 +; AARCH64-NEXT: // kill: def $d1 killed $d1 killed $q1 +; AARCH64-NEXT: // kill: def $d2 killed $d2 killed $q2 +; AARCH64-NEXT: ret +entry: + %c = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> %a, <3 x double> %b) + ret <3 x double> %c +} + +define <4 x double> 
@min_v4f64(<4 x double> %a, <4 x double> %b) { +; AARCH64-LABEL: min_v4f64: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v2.2d, v2.2d, v2.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v0.2d +; AARCH64-NEXT: fminnm v3.2d, v3.2d, v3.2d +; AARCH64-NEXT: fminnm v1.2d, v1.2d, v1.2d +; AARCH64-NEXT: fminnm v0.2d, v0.2d, v2.2d +; AARCH64-NEXT: fminnm v1.2d, v1.2d, v3.2d +; AARCH64-NEXT: ret +entry: + %c = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> %a, <4 x double> %b) + ret <4 x double> %c +} + +;;;;;;;;;;;;;;;;;; min_f32 +define float @min_f32(float %a, float %b) { +; AARCH64-LABEL: min_f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm s1, s1, s1 +; AARCH64-NEXT: fminnm s0, s0, s0 +; AARCH64-NEXT: fminnm s0, s0, s1 +; AARCH64-NEXT: ret +entry: + %c = call float @llvm.minimumnum.f32(float %a, float %b) + ret float %c +} + +define <2 x float> @min_v2f32(<2 x float> %a, <2 x float> %b) { +; AARCH64-LABEL: min_v2f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.2s, v1.2s, v1.2s +; AARCH64-NEXT: fminnm v0.2s, v0.2s, v0.2s +; AARCH64-NEXT: fminnm v0.2s, v0.2s, v1.2s +; AARCH64-NEXT: ret +entry: + %c = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> %a, <2 x float> %b) + ret <2 x float> %c +} + +define <3 x float> @min_v3f32(<3 x float> %a, <3 x float> %b) { +; AARCH64-LABEL: min_v3f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v1.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v1.4s +; AARCH64-NEXT: ret +entry: + %c = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> %a, <3 x float> %b) + ret <3 x float> %c +} + +define <4 x float> @min_v4f32(<4 x float> %a, <4 x float> %b) { +; AARCH64-LABEL: min_v4f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v1.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v1.4s +; AARCH64-NEXT: ret +entry: + %c = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %a, <4 x 
float> %b) + ret <4 x float> %c +} + +define <5 x float> @min_v5f32(<5 x float> %a, <5 x float> %b) { +; AARCH64-LABEL: min_v5f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: // kill: def $s0 killed $s0 def $q0 +; AARCH64-NEXT: // kill: def $s5 killed $s5 def $q5 +; AARCH64-NEXT: // kill: def $s1 killed $s1 def $q1 +; AARCH64-NEXT: // kill: def $s6 killed $s6 def $q6 +; AARCH64-NEXT: // kill: def $s2 killed $s2 def $q2 +; AARCH64-NEXT: // kill: def $s7 killed $s7 def $q7 +; AARCH64-NEXT: // kill: def $s3 killed $s3 def $q3 +; AARCH64-NEXT: mov x8, sp +; AARCH64-NEXT: // kill: def $s4 killed $s4 def $q4 +; AARCH64-NEXT: mov v0.s[1], v1.s[0] +; AARCH64-NEXT: mov v5.s[1], v6.s[0] +; AARCH64-NEXT: mov v0.s[2], v2.s[0] +; AARCH64-NEXT: mov v5.s[2], v7.s[0] +; AARCH64-NEXT: ldr s2, [sp, #8] +; AARCH64-NEXT: fminnm v2.4s, v2.4s, v2.4s +; AARCH64-NEXT: mov v0.s[3], v3.s[0] +; AARCH64-NEXT: ld1 { v5.s }[3], [x8] +; AARCH64-NEXT: fminnm v3.4s, v4.4s, v4.4s +; AARCH64-NEXT: fminnm v1.4s, v5.4s, v5.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fminnm v4.4s, v3.4s, v2.4s +; AARCH64-NEXT: // kill: def $s4 killed $s4 killed $q4 +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v1.4s +; AARCH64-NEXT: mov s1, v0.s[1] +; AARCH64-NEXT: mov s2, v0.s[2] +; AARCH64-NEXT: mov s3, v0.s[3] +; AARCH64-NEXT: // kill: def $s0 killed $s0 killed $q0 +; AARCH64-NEXT: ret +entry: + %c = call <5 x float> @llvm.minimumnum.v5f32(<5 x float> %a, <5 x float> %b) + ret <5 x float> %c +} + +define <8 x float> @min_v8f32(<8 x float> %a, <8 x float> %b) { +; AARCH64-LABEL: min_v8f32: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v2.4s, v2.4s, v2.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v0.4s +; AARCH64-NEXT: fminnm v3.4s, v3.4s, v3.4s +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v1.4s +; AARCH64-NEXT: fminnm v0.4s, v0.4s, v2.4s +; AARCH64-NEXT: fminnm v1.4s, v1.4s, v3.4s +; AARCH64-NEXT: ret +entry: + %c = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> %a, <8 x float> %b) + ret <8 x 
float> %c +} + +;;;;;;;;;;;;;;;;;; min_f16 +define half @min_f16(half %a, half %b) { +; AARCH64-LABEL: min_f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm h1, h1, h1 +; AARCH64-NEXT: fminnm h0, h0, h0 +; AARCH64-NEXT: fminnm h0, h0, h1 +; AARCH64-NEXT: ret +entry: + %c = call half @llvm.minimumnum.f16(half %a, half %b) + ret half %c +} + +define <2 x half> @min_v2f16(<2 x half> %a, <2 x half> %b) { +; AARCH64-LABEL: min_v2f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4h, v1.4h, v1.4h +; AARCH64-NEXT: fminnm v0.4h, v0.4h, v0.4h +; AARCH64-NEXT: fminnm v0.4h, v0.4h, v1.4h +; AARCH64-NEXT: ret +entry: + %c = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %c +} + +define <4 x half> @min_v4f16(<4 x half> %a, <4 x half> %b) { +; AARCH64-LABEL: min_v4f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.4h, v1.4h, v1.4h +; AARCH64-NEXT: fminnm v0.4h, v0.4h, v0.4h +; AARCH64-NEXT: fminnm v0.4h, v0.4h, v1.4h +; AARCH64-NEXT: ret +entry: + %c = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %c +} + +define <8 x half> @min_v8f16(<8 x half> %a, <8 x half> %b) { +; AARCH64-LABEL: min_v8f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v1.8h +; AARCH64-NEXT: ret +entry: + %c = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> %a, <8 x half> %b) + ret <8 x half> %c +} + +define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { +; AARCH64-LABEL: min_v9f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: // kill: def $h0 killed $h0 def $q0 +; AARCH64-NEXT: // kill: def $h1 killed $h1 def $q1 +; AARCH64-NEXT: // kill: def $h2 killed $h2 def $q2 +; AARCH64-NEXT: add x9, sp, #16 +; AARCH64-NEXT: // kill: def $h3 killed $h3 def $q3 +; AARCH64-NEXT: // kill: def $h4 killed $h4 def $q4 +; AARCH64-NEXT: // kill: def $h5 killed $h5 def $q5 +; AARCH64-NEXT: 
// kill: def $h6 killed $h6 def $q6 +; AARCH64-NEXT: // kill: def $h7 killed $h7 def $q7 +; AARCH64-NEXT: mov v0.h[1], v1.h[0] +; AARCH64-NEXT: ldr h1, [sp, #8] +; AARCH64-NEXT: ld1 { v1.h }[1], [x9] +; AARCH64-NEXT: add x9, sp, #24 +; AARCH64-NEXT: mov v0.h[2], v2.h[0] +; AARCH64-NEXT: ldr h2, [sp] +; AARCH64-NEXT: ld1 { v1.h }[2], [x9] +; AARCH64-NEXT: add x9, sp, #32 +; AARCH64-NEXT: fminnm v2.8h, v2.8h, v2.8h +; AARCH64-NEXT: mov v0.h[3], v3.h[0] +; AARCH64-NEXT: ld1 { v1.h }[3], [x9] +; AARCH64-NEXT: add x9, sp, #40 +; AARCH64-NEXT: ldr h3, [sp, #72] +; AARCH64-NEXT: ld1 { v1.h }[4], [x9] +; AARCH64-NEXT: add x9, sp, #48 +; AARCH64-NEXT: fminnm v3.8h, v3.8h, v3.8h +; AARCH64-NEXT: mov v0.h[4], v4.h[0] +; AARCH64-NEXT: ld1 { v1.h }[5], [x9] +; AARCH64-NEXT: add x9, sp, #56 +; AARCH64-NEXT: fminnm v2.8h, v2.8h, v3.8h +; AARCH64-NEXT: mov v0.h[5], v5.h[0] +; AARCH64-NEXT: ld1 { v1.h }[6], [x9] +; AARCH64-NEXT: add x9, sp, #64 +; AARCH64-NEXT: str h2, [x8, #16] +; AARCH64-NEXT: mov v0.h[6], v6.h[0] +; AARCH64-NEXT: ld1 { v1.h }[7], [x9] +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h +; AARCH64-NEXT: mov v0.h[7], v7.h[0] +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v1.8h +; AARCH64-NEXT: str q0, [x8] +; AARCH64-NEXT: ret +entry: + %c = call <9 x half> @llvm.minimumnum.v9f16(<9 x half> %a, <9 x half> %b) + ret <9 x half> %c +} + +define <16 x half> @min_v16f16(<16 x half> %a, <16 x half> %b) { +; AARCH64-LABEL: min_v16f16: +; AARCH64: // %bb.0: // %entry +; AARCH64-NEXT: fminnm v2.8h, v2.8h, v2.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v0.8h +; AARCH64-NEXT: fminnm v3.8h, v3.8h, v3.8h +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v1.8h +; AARCH64-NEXT: fminnm v0.8h, v0.8h, v2.8h +; AARCH64-NEXT: fminnm v1.8h, v1.8h, v3.8h +; AARCH64-NEXT: ret +entry: + %c = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> %a, <16 x half> %b) + ret <16 x half> %c +} From 6c398abb75da5413152f97a780ddb3b3b2b6a0b7 Mon Sep 17 00:00:00 2001 From: Caio Oliveira 
Date: Thu, 10 Oct 2024 18:08:18 -0700 Subject: [PATCH 107/177] [NFC][mlir][spirv] Fix syntax warnings in gen_spirv_dialect.py (#111775) In the context of regular expressions, Python (used to) gracefully ignore the escape behavior of `\` in some contexts, e.g. for representing the regular expression `\w+`. However in newer versions of Python this now gives a warning in the form ``` SyntaxWarning: invalid escape sequence '\w' ``` Fix by explicitly using raw strings instead. --- mlir/utils/spirv/gen_spirv_dialect.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/utils/spirv/gen_spirv_dialect.py b/mlir/utils/spirv/gen_spirv_dialect.py index 78c1022428d8a1..6d82c012158196 100755 --- a/mlir/utils/spirv/gen_spirv_dialect.py +++ b/mlir/utils/spirv/gen_spirv_dialect.py @@ -538,7 +538,7 @@ def gen_instr_coverage_report(path, instructions): prefix = "def SPIRV_OC_" existing_opcodes = [ - k[len(prefix) :] for k in re.findall(prefix + "\w+", content[1]) + k[len(prefix) :] for k in re.findall(prefix + r"\w+", content[1]) ] existing_instructions = list( filter(lambda inst: (inst["opname"] in existing_opcodes), instructions) @@ -597,7 +597,7 @@ def update_td_opcodes(path, instructions, filter_list): # Extend opcode list with existing list prefix = "def SPIRV_OC_" existing_opcodes = [ - k[len(prefix) :] for k in re.findall(prefix + "\w+", content[1]) + k[len(prefix) :] for k in re.findall(prefix + r"\w+", content[1]) ] filter_list.extend(existing_opcodes) filter_list = list(set(filter_list)) @@ -644,7 +644,7 @@ def update_td_enum_attrs(path, operand_kinds, filter_list): suffix = "Attr" existing_kinds = [ k[len(prefix) : -len(suffix)] - for k in re.findall(prefix + "\w+" + suffix, content[1]) + for k in re.findall(prefix + r"\w+" + suffix, content[1]) ] filter_list.extend(existing_kinds) @@ -971,7 +971,7 @@ def extract_td_op_info(op_def): suffix = "Op" opname = [ o[len(prefix) : -len(suffix)] - for o in re.findall(prefix + "\w+" + suffix, op_def) + 
for o in re.findall(prefix + r"\w+" + suffix, op_def) ] assert len(opname) == 1, "more than one ops in the same section!" opname = opname[0] @@ -979,7 +979,7 @@ def extract_td_op_info(op_def): # Get instruction category prefix = "SPIRV_" inst_category = [ - o[len(prefix) :] for o in re.findall(prefix + "\w+Op", op_def.split(":", 1)[1]) + o[len(prefix) :] for o in re.findall(prefix + r"\w+Op", op_def.split(":", 1)[1]) ] assert len(inst_category) <= 1, "more than one ops in the same section!" inst_category = inst_category[0] if len(inst_category) == 1 else "Op" From e3894f58e1a534c57f53b3beb21d6b2f0d3382b2 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 11 Oct 2024 09:08:38 +0800 Subject: [PATCH 108/177] [CodeGenPrepare] Convert `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` (#111284) Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. After https://github.com/llvm/llvm-project/pull/100899, we set the range of ctpop's return value to indicate the argument/result is non-zero. This patch converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` in CGP to fix https://github.com/llvm/llvm-project/issues/95255. --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 28 ++++++++ llvm/test/CodeGen/AArch64/arm64-popcnt.ll | 68 +++++++++++++++++-- llvm/test/CodeGen/RISCV/rv32zbb.ll | 39 +++++++++++ llvm/test/CodeGen/RISCV/rv64zbb.ll | 81 +++++++++++++++++++++++ llvm/test/CodeGen/X86/ispow2.ll | 45 ++++++++++++- llvm/test/CodeGen/X86/known-never-zero.ll | 12 ++-- 6 files changed, 258 insertions(+), 15 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 3e09fbad6ab198..86f28293ba9ff8 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2111,6 +2111,31 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) { return false; } +/// Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. 
+/// This function converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` if the +/// result cannot be zero. +static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI, + const TargetTransformInfo &TTI, + const DataLayout &DL) { + ICmpInst::Predicate Pred; + if (!match(Cmp, m_ICmp(Pred, m_Intrinsic(), m_One()))) + return false; + if (!ICmpInst::isEquality(Pred)) + return false; + auto *II = cast(Cmp->getOperand(0)); + + if (isKnownNonZero(II, DL)) { + if (Pred == ICmpInst::ICMP_EQ) { + Cmp->setOperand(1, ConstantInt::get(II->getType(), 2)); + Cmp->setPredicate(ICmpInst::ICMP_ULT); + } else { + Cmp->setPredicate(ICmpInst::ICMP_UGT); + } + return true; + } + return false; +} + bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (sinkCmpExpression(Cmp, *TLI)) return true; @@ -2130,6 +2155,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (foldFCmpToFPClassTest(Cmp, *TLI, *DL)) return true; + if (adjustIsPower2Test(Cmp, *TLI, *TTI, *DL)) + return true; + return false; } diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll index f5ce73a366125b..0030e9ce80abb4 100644 --- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll @@ -15,7 +15,7 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone { ; CHECK-NONEON-LABEL: cnt32_advsimd: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 +; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101 ; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 ; CHECK-NONEON-NEXT: sub w9, w0, w9 ; CHECK-NONEON-NEXT: lsr w10, w9, #2 @@ -50,7 +50,7 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) { ; CHECK-NONEON-LABEL: cnt32_advsimd_2: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 +; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101 ; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 ; CHECK-NONEON-NEXT: sub w9, w0, w9 
; CHECK-NONEON-NEXT: lsr w10, w9, #2 @@ -86,7 +86,7 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone { ; CHECK-NONEON-LABEL: cnt64_advsimd: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr x9, x0, #1 -; CHECK-NONEON-NEXT: mov x8, #72340172838076673 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101 ; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 ; CHECK-NONEON-NEXT: sub x9, x0, x9 ; CHECK-NONEON-NEXT: lsr x10, x9, #2 @@ -114,7 +114,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat { ; CHECK-LABEL: cnt32: ; CHECK: // %bb.0: ; CHECK-NEXT: lsr w9, w0, #1 -; CHECK-NEXT: mov w8, #16843009 +; CHECK-NEXT: mov w8, #16843009 // =0x1010101 ; CHECK-NEXT: and w9, w9, #0x55555555 ; CHECK-NEXT: sub w9, w0, w9 ; CHECK-NEXT: lsr w10, w9, #2 @@ -130,7 +130,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat { ; CHECK-NONEON-LABEL: cnt32: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 +; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101 ; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 ; CHECK-NONEON-NEXT: sub w9, w0, w9 ; CHECK-NONEON-NEXT: lsr w10, w9, #2 @@ -155,7 +155,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { ; CHECK-LABEL: cnt64: ; CHECK: // %bb.0: ; CHECK-NEXT: lsr x9, x0, #1 -; CHECK-NEXT: mov x8, #72340172838076673 +; CHECK-NEXT: mov x8, #72340172838076673 // =0x101010101010101 ; CHECK-NEXT: and x9, x9, #0x5555555555555555 ; CHECK-NEXT: sub x9, x0, x9 ; CHECK-NEXT: lsr x10, x9, #2 @@ -171,7 +171,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { ; CHECK-NONEON-LABEL: cnt64: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr x9, x0, #1 -; CHECK-NONEON-NEXT: mov x8, #72340172838076673 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101 ; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 ; CHECK-NONEON-NEXT: sub x9, x0, x9 ; CHECK-NONEON-NEXT: lsr x10, x9, #2 @@ -278,5 +278,59 @@ define i1 
@ctpop32_ne_one(i32 %x) nounwind readnone { ret i1 %cmp } +define i1 @ctpop32_eq_one_nonzero(i32 %x) { +; CHECK-LABEL: ctpop32_eq_one_nonzero: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub w8, w0, #1 +; CHECK-NEXT: tst w0, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +; +; CHECK-NONEON-LABEL: ctpop32_eq_one_nonzero: +; CHECK-NONEON: // %bb.0: // %entry +; CHECK-NONEON-NEXT: sub w8, w0, #1 +; CHECK-NONEON-NEXT: tst w0, w8 +; CHECK-NONEON-NEXT: cset w0, eq +; CHECK-NONEON-NEXT: ret +; +; CHECK-CSSC-LABEL: ctpop32_eq_one_nonzero: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: sub w8, w0, #1 +; CHECK-CSSC-NEXT: tst w0, w8 +; CHECK-CSSC-NEXT: cset w0, eq +; CHECK-CSSC-NEXT: ret +entry: + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp eq i32 %popcnt, 1 + ret i1 %cmp +} + +define i1 @ctpop32_ne_one_nonzero(i32 %x) { +; CHECK-LABEL: ctpop32_ne_one_nonzero: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub w8, w0, #1 +; CHECK-NEXT: tst w0, w8 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret +; +; CHECK-NONEON-LABEL: ctpop32_ne_one_nonzero: +; CHECK-NONEON: // %bb.0: // %entry +; CHECK-NONEON-NEXT: sub w8, w0, #1 +; CHECK-NONEON-NEXT: tst w0, w8 +; CHECK-NONEON-NEXT: cset w0, ne +; CHECK-NONEON-NEXT: ret +; +; CHECK-CSSC-LABEL: ctpop32_ne_one_nonzero: +; CHECK-CSSC: // %bb.0: // %entry +; CHECK-CSSC-NEXT: sub w8, w0, #1 +; CHECK-CSSC-NEXT: tst w0, w8 +; CHECK-CSSC-NEXT: cset w0, ne +; CHECK-CSSC-NEXT: ret +entry: + %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp ne i32 %popcnt, 1 + ret i1 %cmp +} + declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i64 @llvm.ctpop.i64(i64) nounwind readnone diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index e24b1b41645cdf..4c52047b928f4d 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -1441,3 +1441,42 @@ define i32 @srai_slli2(i16 signext %0) { %3 = sext i16 %sext to i32 ret i32 %3 } 
+ +define i1 @ctpop32_eq_one_nonzero(i32 %x) { +; RV32I-LABEL: ctpop32_eq_one_nonzero: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop32_eq_one_nonzero: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: ret +entry: + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp eq i32 %popcnt, 1 + ret i1 %cmp +} + +define i1 @ctpop32_ne_one_nonzero(i32 %x) { +; RV32I-LABEL: ctpop32_ne_one_nonzero: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop32_ne_one_nonzero: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: xori a0, a0, 1 +; RV32ZBB-NEXT: ret +entry: + %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp ne i32 %popcnt, 1 + ret i1 %cmp +} diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index 43a499806ab5ae..1e7814d588e4c0 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -1618,3 +1618,84 @@ entry: %5 = add nsw i32 %4, %0 ret i32 %5 } + +define i1 @ctpop32_eq_one_nonzero(i32 %x) { +; RV64I-LABEL: ctpop32_eq_one_nonzero: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi a1, a0, -1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop32_eq_one_nonzero: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: ret +entry: + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp eq i32 %popcnt, 1 + ret i1 %cmp +} + +define i1 @ctpop32_ne_one_nonzero(i32 %x) { +; RV64I-LABEL: ctpop32_ne_one_nonzero: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi a1, a0, -1 +; 
RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop32_ne_one_nonzero: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: xori a0, a0, 1 +; RV64ZBB-NEXT: ret +entry: + %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp ne i32 %popcnt, 1 + ret i1 %cmp +} + +define i1 @ctpop64_eq_one_nonzero(i64 %x) { +; RV64I-LABEL: ctpop64_eq_one_nonzero: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi a1, a0, -1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop64_eq_one_nonzero: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: cpop a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: ret +entry: + %popcnt = call range(i64 1, 65) i64 @llvm.ctpop.i64(i64 %x) + %cmp = icmp eq i64 %popcnt, 1 + ret i1 %cmp +} + +define i1 @ctpop32_eq_one_maybezero(i32 %x) { +; RV64I-LABEL: ctpop32_eq_one_maybezero: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addiw a1, a0, -1 +; RV64I-NEXT: xor a0, a0, a1 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: sltu a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop32_eq_one_maybezero: +; RV64ZBB: # %bb.0: # %entry +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: addi a0, a0, -1 +; RV64ZBB-NEXT: seqz a0, a0 +; RV64ZBB-NEXT: ret +entry: + %popcnt = call range(i32 0, 16) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp eq i32 %popcnt, 1 + ret i1 %cmp +} diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll index 8723432de8b6b0..649d257b28d762 100644 --- a/llvm/test/CodeGen/X86/ispow2.ll +++ b/llvm/test/CodeGen/X86/ispow2.ll @@ -102,7 +102,7 @@ define <4 x i1> @is_pow2_non_zero_4xv64(<4 x i64> %xin) { ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 +; 
CHECK-AVX512-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 ; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-AVX512-NEXT: vzeroupper @@ -155,7 +155,7 @@ define <4 x i1> @neither_pow2_non_zero_4xv64(<4 x i64> %xin) { ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 +; CHECK-AVX512-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 ; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-AVX512-NEXT: vzeroupper @@ -220,3 +220,44 @@ define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) { %r = icmp ne <4 x i64> %cnt, ret <4 x i1> %r } + + +define i1 @ctpop32_eq_one_nonzero(i32 %x) { +; CHECK-NOBMI-LABEL: ctpop32_eq_one_nonzero: +; CHECK-NOBMI: # %bb.0: # %entry +; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax +; CHECK-NOBMI-NEXT: testl %eax, %edi +; CHECK-NOBMI-NEXT: sete %al +; CHECK-NOBMI-NEXT: retq +; +; CHECK-BMI2-LABEL: ctpop32_eq_one_nonzero: +; CHECK-BMI2: # %bb.0: # %entry +; CHECK-BMI2-NEXT: blsrl %edi, %eax +; CHECK-BMI2-NEXT: sete %al +; CHECK-BMI2-NEXT: retq +entry: + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp eq i32 %popcnt, 1 + ret i1 %cmp +} + +define i1 @ctpop32_ne_one_nonzero(i32 %x) { +; CHECK-NOBMI-LABEL: ctpop32_ne_one_nonzero: +; CHECK-NOBMI: # %bb.0: # %entry +; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax +; CHECK-NOBMI-NEXT: testl %eax, %edi +; CHECK-NOBMI-NEXT: setne %al +; CHECK-NOBMI-NEXT: retq +; +; CHECK-BMI2-LABEL: ctpop32_ne_one_nonzero: +; CHECK-BMI2: # %bb.0: # %entry +; CHECK-BMI2-NEXT: blsrl %edi, %eax +; CHECK-BMI2-NEXT: setne %al +; CHECK-BMI2-NEXT: retq +entry: + 
%popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp = icmp ne i32 %popcnt, 1 + ret i1 %cmp +} diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll index ac41a3fe6bb7e4..6c0aaeb451e14a 100644 --- a/llvm/test/CodeGen/X86/known-never-zero.ll +++ b/llvm/test/CodeGen/X86/known-never-zero.ll @@ -555,9 +555,9 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) { ; X86-NEXT: por %xmm2, %xmm0 ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-NEXT: pxor %xmm1, %xmm0 -; X86-NEXT: pcmpgtd %xmm1, %xmm0 +; X86-NEXT: pand %xmm1, %xmm0 +; X86-NEXT: pxor %xmm1, %xmm1 +; X86-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-NEXT: psrld $31, %xmm0 ; X86-NEXT: retl ; @@ -566,10 +566,10 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) { ; X64-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; X64-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpsrld $31, %xmm0, %xmm0 ; X64-NEXT: retq %z = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> ) %r = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %z) From cbfcea1fc2154c92880278878610e16faba979be Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 10 Oct 2024 20:18:54 -0500 Subject: [PATCH 109/177] [libc] Temporarily disable strerror test on NVPTX Summary: This is failing on the NVPTX buildbot, https://lab.llvm.org/buildbot/#/builders/69/builds/6997/. I cannot reproduce it locally so I'm disabling it temporarily so the bot is green. 
--- libc/test/src/string/CMakeLists.txt | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt index c1caec5fd912c8..44535957e740be 100644 --- a/libc/test/src/string/CMakeLists.txt +++ b/libc/test/src/string/CMakeLists.txt @@ -215,16 +215,18 @@ add_libc_test( libc.src.errno.errno ) -add_libc_test( - strerror_test - SUITE - libc-string-tests - SRCS - strerror_test.cpp - DEPENDS - libc.src.string.strerror -) - +# FIXME: This is failing on the bot for some reason, disable for now. +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + add_libc_test( + strerror_test + SUITE + libc-string-tests + SRCS + strerror_test.cpp + DEPENDS + libc.src.string.strerror + ) +endif() add_libc_test( strerror_r_test From 9f8ae7844dee7bb5527a59249e74885fb3bfb4a9 Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang Date: Fri, 11 Oct 2024 03:31:26 +0200 Subject: [PATCH 110/177] [lldb-dap] Implement value locations for function pointers (#104589) This commit adds `valueLocationReference` to function pointers and function references. Thereby, users can navigate directly to the pointed-to function from within the "variables" pane. In general, it would be useful to also a add similar location references also to member function pointers, `std::source_location`, `std::function`, and many more. Doing so would require extending the formatters to provide such a source code location. There were two RFCs about this a while ago: https://discourse.llvm.org/t/rfc-extending-formatters-with-a-source-code-reference/68375 https://discourse.llvm.org/t/rfc-sbvalue-metadata-provider/68377/26 However, both RFCs ended without a conclusion. As such, this commit now implements the lowest-hanging fruit, i.e. function pointers. If people find it useful, I will revive the RFC afterwards. 
--- .../API/tools/lldb-dap/locations/Makefile | 2 +- .../lldb-dap/locations/TestDAP_locations.py | 49 +++++++- lldb/test/API/tools/lldb-dap/locations/main.c | 5 - .../API/tools/lldb-dap/locations/main.cpp | 10 ++ lldb/tools/lldb-dap/JSONUtils.cpp | 41 ++++++- lldb/tools/lldb-dap/JSONUtils.h | 10 ++ lldb/tools/lldb-dap/lldb-dap.cpp | 115 ++++++++++++++---- 7 files changed, 192 insertions(+), 40 deletions(-) delete mode 100644 lldb/test/API/tools/lldb-dap/locations/main.c create mode 100644 lldb/test/API/tools/lldb-dap/locations/main.cpp diff --git a/lldb/test/API/tools/lldb-dap/locations/Makefile b/lldb/test/API/tools/lldb-dap/locations/Makefile index 10495940055b63..99998b20bcb050 100644 --- a/lldb/test/API/tools/lldb-dap/locations/Makefile +++ b/lldb/test/API/tools/lldb-dap/locations/Makefile @@ -1,3 +1,3 @@ -C_SOURCES := main.c +CXX_SOURCES := main.cpp include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/locations/TestDAP_locations.py b/lldb/test/API/tools/lldb-dap/locations/TestDAP_locations.py index 76d938d3908492..45f836a2fa3c39 100644 --- a/lldb/test/API/tools/lldb-dap/locations/TestDAP_locations.py +++ b/lldb/test/API/tools/lldb-dap/locations/TestDAP_locations.py @@ -19,11 +19,11 @@ def test_locations(self): """ program = self.getBuildArtifact("a.out") self.build_and_launch(program) - source = "main.c" + source = "main.cpp" self.source_path = os.path.join(os.getcwd(), source) self.set_source_breakpoints( source, - [line_number(source, "// BREAK HERE")], + [line_number(source, "break here")], ) self.continue_to_next_stop() @@ -36,5 +36,46 @@ def test_locations(self): locals["var1"]["declarationLocationReference"] ) self.assertTrue(loc_var1["success"]) - self.assertTrue(loc_var1["body"]["source"]["path"].endswith("main.c")) - self.assertEqual(loc_var1["body"]["line"], 2) + self.assertTrue(loc_var1["body"]["source"]["path"].endswith("main.cpp")) + self.assertEqual(loc_var1["body"]["line"], 6) + + # func_ptr has both a declaration and a valueLocation 
+ self.assertIn("declarationLocationReference", locals["func_ptr"].keys()) + self.assertIn("valueLocationReference", locals["func_ptr"].keys()) + decl_loc_func_ptr = self.dap_server.request_locations( + locals["func_ptr"]["declarationLocationReference"] + ) + self.assertTrue(decl_loc_func_ptr["success"]) + self.assertTrue( + decl_loc_func_ptr["body"]["source"]["path"].endswith("main.cpp") + ) + self.assertEqual(decl_loc_func_ptr["body"]["line"], 7) + val_loc_func_ptr = self.dap_server.request_locations( + locals["func_ptr"]["valueLocationReference"] + ) + self.assertTrue(val_loc_func_ptr["success"]) + self.assertTrue(val_loc_func_ptr["body"]["source"]["path"].endswith("main.cpp")) + self.assertEqual(val_loc_func_ptr["body"]["line"], 3) + + # func_ref has both a declaration and a valueLocation + self.assertIn("declarationLocationReference", locals["func_ref"].keys()) + self.assertIn("valueLocationReference", locals["func_ref"].keys()) + decl_loc_func_ref = self.dap_server.request_locations( + locals["func_ref"]["declarationLocationReference"] + ) + self.assertTrue(decl_loc_func_ref["success"]) + self.assertTrue( + decl_loc_func_ref["body"]["source"]["path"].endswith("main.cpp") + ) + self.assertEqual(decl_loc_func_ref["body"]["line"], 8) + val_loc_func_ref = self.dap_server.request_locations( + locals["func_ref"]["valueLocationReference"] + ) + self.assertTrue(val_loc_func_ref["success"]) + self.assertTrue(val_loc_func_ref["body"]["source"]["path"].endswith("main.cpp")) + self.assertEqual(val_loc_func_ref["body"]["line"], 3) + + # `evaluate` responses for function pointers also have locations associated + eval_res = self.dap_server.request_evaluate("greet") + self.assertTrue(eval_res["success"]) + self.assertIn("valueLocationReference", eval_res["body"].keys()) diff --git a/lldb/test/API/tools/lldb-dap/locations/main.c b/lldb/test/API/tools/lldb-dap/locations/main.c deleted file mode 100644 index 6a8c86d00cb562..00000000000000 --- 
a/lldb/test/API/tools/lldb-dap/locations/main.c +++ /dev/null @@ -1,5 +0,0 @@ -int main(void) { - int var1 = 1; - // BREAK HERE - return 0; -} diff --git a/lldb/test/API/tools/lldb-dap/locations/main.cpp b/lldb/test/API/tools/lldb-dap/locations/main.cpp new file mode 100644 index 00000000000000..fb7789ffd86fdf --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/locations/main.cpp @@ -0,0 +1,10 @@ +#include + +void greet() { printf("Hello"); } + +int main(void) { + int var1 = 1; + void (*func_ptr)() = &greet; + void (&func_ref)() = greet; + return 0; // break here +} diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 558f889c4b7f23..e42a6d9d699804 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -1223,6 +1223,25 @@ std::string VariableDescription::GetResult(llvm::StringRef context) { return description.trim().str(); } +bool ValuePointsToCode(lldb::SBValue v) { + if (!v.GetType().GetPointeeType().IsFunctionType()) + return false; + + lldb::addr_t addr = v.GetValueAsAddress(); + lldb::SBLineEntry line_entry = + g_dap.target.ResolveLoadAddress(addr).GetLineEntry(); + + return line_entry.IsValid(); +} + +int64_t PackLocation(int64_t var_ref, bool is_value_location) { + return var_ref << 1 | is_value_location; +} + +std::pair UnpackLocation(int64_t location_id) { + return std::pair{location_id >> 1, location_id & 1}; +} + // "Variable": { // "type": "object", // "description": "A Variable is a name/value pair. Optionally a variable @@ -1302,6 +1321,18 @@ std::string VariableDescription::GetResult(llvm::StringRef context) { // Object References' in the Overview section for // details." // }, +// "valueLocationReference": { +// "type": "integer", +// "description": "A reference that allows the client to request the +// location where the variable's value is declared. For +// example, if the variable contains a function pointer, +// the adapter may be able to look up the function's +// location. 
This should be present only if the adapter +// is likely to be able to resolve the location.\n\nThis +// reference shares the same lifetime as the +// `variablesReference`. See 'Lifetime of Object +// References' in the Overview section for details." +// }, // // "$__lldb_extensions": { // "description": "Unofficial extensions to the protocol", @@ -1415,7 +1446,11 @@ llvm::json::Value CreateVariable(lldb::SBValue v, int64_t var_ref, object.try_emplace("variablesReference", 0); if (v.GetDeclaration().IsValid()) - object.try_emplace("declarationLocationReference", var_ref); + object.try_emplace("declarationLocationReference", + PackLocation(var_ref, false)); + + if (ValuePointsToCode(v)) + object.try_emplace("valueLocationReference", PackLocation(var_ref, true)); if (lldb::addr_t addr = v.GetLoadAddress(); addr != LLDB_INVALID_ADDRESS) object.try_emplace("memoryReference", EncodeMemoryReference(addr)); @@ -1441,8 +1476,8 @@ CreateRunInTerminalReverseRequest(const llvm::json::Object &launch_request, llvm::StringRef comm_file, lldb::pid_t debugger_pid) { llvm::json::Object run_in_terminal_args; - // This indicates the IDE to open an embedded terminal, instead of opening the - // terminal in a new window. + // This indicates the IDE to open an embedded terminal, instead of opening + // the terminal in a new window. run_in_terminal_args.try_emplace("kind", "integrated"); auto launch_request_arguments = launch_request.getObject("arguments"); diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h index 18cfb4081fece1..54fc4323475723 100644 --- a/lldb/tools/lldb-dap/JSONUtils.h +++ b/lldb/tools/lldb-dap/JSONUtils.h @@ -480,6 +480,16 @@ struct VariableDescription { std::string GetResult(llvm::StringRef context); }; +/// Does the given variable have an associated value location? +bool ValuePointsToCode(lldb::SBValue v); + +/// Pack a location into a single integer which we can send via +/// the debug adapter protocol. 
+int64_t PackLocation(int64_t var_ref, bool is_value_location); + +/// Reverse of `PackLocation` +std::pair UnpackLocation(int64_t location_id); + /// Create a "Variable" object for a LLDB thread object. /// /// This function will fill in the following keys in the returned diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index ac18e8f24a4e39..a167088c8901ca 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -1561,6 +1561,19 @@ void request_completions(const llvm::json::Object &request) { // client can use this optional information to // present the variables in a paged UI and fetch // them in chunks." +// }, +// "valueLocationReference": { +// "type": "integer", +// "description": "A reference that allows the client to request +// the location where the returned value is +// declared. For example, if a function pointer is +// returned, the adapter may be able to look up the +// function's location. This should be present only +// if the adapter is likely to be able to resolve +// the location.\n\nThis reference shares the same +// lifetime as the `variablesReference`. See +// 'Lifetime of Object References' in the +// Overview section for details." 
// } // "memoryReference": { // "type": "string", @@ -1647,16 +1660,19 @@ void request_evaluate(const llvm::json::Object &request) { VariableDescription desc(value); EmplaceSafeString(body, "result", desc.GetResult(context)); EmplaceSafeString(body, "type", desc.display_type_name); - if (value.MightHaveChildren()) { - auto variableReference = g_dap.variables.InsertVariable( + int64_t var_ref = 0; + if (value.MightHaveChildren() || ValuePointsToCode(value)) + var_ref = g_dap.variables.InsertVariable( value, /*is_permanent=*/context == "repl"); - body.try_emplace("variablesReference", variableReference); - } else { + if (value.MightHaveChildren()) + body.try_emplace("variablesReference", var_ref); + else body.try_emplace("variablesReference", (int64_t)0); - } if (lldb::addr_t addr = value.GetLoadAddress(); addr != LLDB_INVALID_ADDRESS) body.try_emplace("memoryReference", EncodeMemoryReference(addr)); + if (ValuePointsToCode(value)) + body.try_emplace("valueLocationReference", var_ref); } } response.try_emplace("body", std::move(body)); @@ -3770,6 +3786,17 @@ void request_threads(const llvm::json::Object &request) { // "description": "The number of indexed child variables. The client // can use this optional information to present the variables in a // paged UI and fetch them in chunks." +// }, +// "valueLocationReference": { +// "type": "integer", +// "description": "A reference that allows the client to request the +// location where the new value is declared. For example, if the new +// value is function pointer, the adapter may be able to look up the +// function's location. This should be present only if the adapter +// is likely to be able to resolve the location.\n\nThis reference +// shares the same lifetime as the `variablesReference`. See +// 'Lifetime of Object References' in the Overview section for +// details." 
// } // }, // "required": [ "value" ] @@ -3794,7 +3821,6 @@ void request_setVariable(const llvm::json::Object &request) { response.try_emplace("success", false); lldb::SBValue variable; - int64_t newVariablesReference = 0; // The "id" is the unique integer ID that is unique within the enclosing // variablesReference. It is optionally added to any "interface Variable" @@ -3824,14 +3850,17 @@ void request_setVariable(const llvm::json::Object &request) { // so always insert a new one to get its variablesReference. // is_permanent is false because debug console does not support // setVariable request. + int64_t new_var_ref = + g_dap.variables.InsertVariable(variable, /*is_permanent=*/false); if (variable.MightHaveChildren()) - newVariablesReference = - g_dap.variables.InsertVariable(variable, /*is_permanent=*/false); - body.try_emplace("variablesReference", newVariablesReference); - + body.try_emplace("variablesReference", new_var_ref); + else + body.try_emplace("variablesReference", 0); if (lldb::addr_t addr = variable.GetLoadAddress(); addr != LLDB_INVALID_ADDRESS) body.try_emplace("memoryReference", EncodeMemoryReference(addr)); + if (ValuePointsToCode(variable)) + body.try_emplace("valueLocationReference", new_var_ref); } else { EmplaceSafeString(body, "message", std::string(error.GetCString())); } @@ -4122,10 +4151,13 @@ void request_variables(const llvm::json::Object &request) { void request_locations(const llvm::json::Object &request) { llvm::json::Object response; FillResponse(request, response); - auto arguments = request.getObject("arguments"); + auto *arguments = request.getObject("arguments"); - uint64_t reference_id = GetUnsigned(arguments, "locationReference", 0); - lldb::SBValue variable = g_dap.variables.GetVariable(reference_id); + uint64_t location_id = GetUnsigned(arguments, "locationReference", 0); + // We use the lowest bit to distinguish between value location and declaration + // location + auto [var_ref, is_value_location] = 
UnpackLocation(location_id); + lldb::SBValue variable = g_dap.variables.GetVariable(var_ref); if (!variable.IsValid()) { response["success"] = false; response["message"] = "Invalid variable reference"; @@ -4133,21 +4165,50 @@ void request_locations(const llvm::json::Object &request) { return; } - // Get the declaration location - lldb::SBDeclaration decl = variable.GetDeclaration(); - if (!decl.IsValid()) { - response["success"] = false; - response["message"] = "No declaration location available"; - g_dap.SendJSON(llvm::json::Value(std::move(response))); - return; - } - llvm::json::Object body; - body.try_emplace("source", CreateSource(decl.GetFileSpec())); - if (int line = decl.GetLine()) - body.try_emplace("line", line); - if (int column = decl.GetColumn()) - body.try_emplace("column", column); + if (is_value_location) { + // Get the value location + if (!variable.GetType().IsPointerType() && + !variable.GetType().IsReferenceType()) { + response["success"] = false; + response["message"] = + "Value locations are only available for pointers and references"; + g_dap.SendJSON(llvm::json::Value(std::move(response))); + return; + } + + lldb::addr_t addr = variable.GetValueAsAddress(); + lldb::SBLineEntry line_entry = + g_dap.target.ResolveLoadAddress(addr).GetLineEntry(); + + if (!line_entry.IsValid()) { + response["success"] = false; + response["message"] = "Failed to resolve line entry for location"; + g_dap.SendJSON(llvm::json::Value(std::move(response))); + return; + } + + body.try_emplace("source", CreateSource(line_entry.GetFileSpec())); + if (int line = line_entry.GetLine()) + body.try_emplace("line", line); + if (int column = line_entry.GetColumn()) + body.try_emplace("column", column); + } else { + // Get the declaration location + lldb::SBDeclaration decl = variable.GetDeclaration(); + if (!decl.IsValid()) { + response["success"] = false; + response["message"] = "No declaration location available"; + g_dap.SendJSON(llvm::json::Value(std::move(response))); + 
return; + } + + body.try_emplace("source", CreateSource(decl.GetFileSpec())); + if (int line = decl.GetLine()) + body.try_emplace("line", line); + if (int column = decl.GetColumn()) + body.try_emplace("column", column); + } response.try_emplace("body", std::move(body)); g_dap.SendJSON(llvm::json::Value(std::move(response))); From 9882b35a3a3e46d749b801bd0b98c3d90af6006c Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Fri, 11 Oct 2024 10:18:40 +0800 Subject: [PATCH 111/177] [X86][StrictFP] Combine fcmp + select to fmin/fmax for some predicates (#109512) X86 maxss/minss etc. instructions won't turn SNaN to QNaN, so we can combine fcmp + select to them for some predicates. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 32 ++-- llvm/lib/Target/X86/X86ISelLowering.h | 4 + llvm/lib/Target/X86/X86InstrAVX512.td | 8 +- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 12 ++ llvm/lib/Target/X86/X86InstrSSE.td | 8 +- llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll | 149 +++++++++++++++++- 6 files changed, 195 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 77c10baa31bd21..7a6d20c6a121b6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34219,10 +34219,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMAXS) NODE_NAME_CASE(FMAX_SAE) NODE_NAME_CASE(FMAXS_SAE) + NODE_NAME_CASE(STRICT_FMAX) NODE_NAME_CASE(FMIN) NODE_NAME_CASE(FMINS) NODE_NAME_CASE(FMIN_SAE) NODE_NAME_CASE(FMINS_SAE) + NODE_NAME_CASE(STRICT_FMIN) NODE_NAME_CASE(FMAXC) NODE_NAME_CASE(FMINC) NODE_NAME_CASE(FRSQRT) @@ -46461,17 +46463,21 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, // x<=y?x:y, because of how they handle negative zero (which can be // ignored in unsafe-math mode). // We also try to create v2f32 min/max nodes, which we later widen to v4f32. 
- if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) && - (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && + if ((Cond.getOpcode() == ISD::SETCC || + Cond.getOpcode() == ISD::STRICT_FSETCCS) && + VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 && + !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget.hasSSE2() || (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) { - ISD::CondCode CC = cast(Cond.getOperand(2))->get(); + bool IsStrict = Cond->isStrictFPOpcode(); + ISD::CondCode CC = + cast(Cond.getOperand(IsStrict ? 3 : 2))->get(); + SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0); + SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1); unsigned Opcode = 0; // Check for x CC y ? x : y. - if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && - DAG.isEqualTo(RHS, Cond.getOperand(1))) { + if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) { switch (CC) { default: break; case ISD::SETULT: @@ -46539,8 +46545,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, break; } // Check for x CC y ? y : x -- a min/max with reversed arms. - } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && - DAG.isEqualTo(RHS, Cond.getOperand(0))) { + } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) { switch (CC) { default: break; case ISD::SETOGE: @@ -46605,8 +46610,17 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, } } - if (Opcode) + if (Opcode) { + if (IsStrict) { + SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? 
X86ISD::STRICT_FMIN + : X86ISD::STRICT_FMAX, + DL, {N->getValueType(0), MVT::Other}, + {Cond.getOperand(0), LHS, RHS}); + DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1)); + return Ret; + } return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); + } } // Some mask scalar intrinsics rely on checking if only one bit is set diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index a2515ff35e6925..3b1bd0ad9a267e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -850,6 +850,10 @@ namespace llvm { // Perform an FP80 add after changing precision control in FPCW. STRICT_FP80_ADD, + /// Floating point max and min. + STRICT_FMAX, + STRICT_FMIN, + // WARNING: Only add nodes here if they are strict FP nodes. Non-memory and // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE. diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index b9ff4a5280ec3e..98c31867e6b22b 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -5395,7 +5395,7 @@ multiclass avx512_fp_scalar_round opc, string OpcodeStr,X86VectorVTInfo EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fp_scalar_sae opc, string OpcodeStr,X86VectorVTInfo _, - SDNode OpNode, SDNode VecNode, SDNode SaeNode, + SDPatternOperator OpNode, SDNode VecNode, SDNode SaeNode, X86FoldableSchedWrite sched, bit IsCommutable> { let ExeDomain = _.ExeDomain in { defm rr_Int : AVX512_maskable_scalar opc, string OpcodeStr, SDPatternOperator T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>; } -multiclass avx512_binop_s_sae opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_binop_s_sae opc, string OpcodeStr, SDPatternOperator OpNode, SDNode VecNode, SDNode SaeNode, X86SchedWriteSizes sched, bit IsCommutable> { defm SSZ : avx512_fp_scalar_sae; defm VDIV : avx512_binop_s_round<0x5E, "vdiv", any_fdiv, X86fdivs, X86fdivRnds, 
SchedWriteFDivSizes, 0>; -defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminSAEs, +defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86any_fmin, X86fmins, X86fminSAEs, SchedWriteFCmpSizes, 0>; -defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs, +defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86any_fmax, X86fmaxs, X86fmaxSAEs, SchedWriteFCmpSizes, 0>; // MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index ed1bff05b7316c..c09522709d2f0d 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -46,6 +46,18 @@ def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp, def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; +def X86strict_fmin : SDNode<"X86ISD::STRICT_FMIN", SDTFPBinOp, + [SDNPHasChain]>; +def X86strict_fmax : SDNode<"X86ISD::STRICT_FMAX", SDTFPBinOp, + [SDNPHasChain]>; + +def X86any_fmin : PatFrags<(ops node:$src1, node:$src2), + [(X86strict_fmin node:$src1, node:$src2), + (X86fmin node:$src1, node:$src2)]>; +def X86any_fmax : PatFrags<(ops node:$src1, node:$src2), + [(X86strict_fmax node:$src1, node:$src2), + (X86fmax node:$src1, node:$src2)]>; + def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index d51125a209db9d..e77e56aa96c670 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -2730,11 +2730,11 @@ let isCommutable = 0 in { defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>, basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>, basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>; - defm MAX : basic_sse12_fp_binop_p<0x5F, "max", 
X86fmax, SchedWriteFCmpSizes>, - basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, + defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86any_fmax, SchedWriteFCmpSizes>, + basic_sse12_fp_binop_s<0x5F, "max", X86any_fmax, SchedWriteFCmpSizes>, basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>; - defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, - basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, + defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86any_fmin, SchedWriteFCmpSizes>, + basic_sse12_fp_binop_s<0x5D, "min", X86any_fmin, SchedWriteFCmpSizes>, basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>; } diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll index cb1876fee05aea..e3e2b6225a7ba0 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp.ll @@ -4202,7 +4202,154 @@ define void @foo(float %0, float %1) #0 { } declare dso_local void @bar() -attributes #0 = { strictfp } +define float @fcmp_select_ogt(float %f1, float %f2) #0 { +; SSE-32-LABEL: fcmp_select_ogt: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %eax +; SSE-32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-32-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 +; SSE-32-NEXT: movss %xmm0, (%esp) +; SSE-32-NEXT: flds (%esp) +; SSE-32-NEXT: wait +; SSE-32-NEXT: popl %eax +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: fcmp_select_ogt: +; SSE-64: # %bb.0: +; SSE-64-NEXT: maxss %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: fcmp_select_ogt: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %eax +; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovss %xmm0, (%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: wait +; AVX-32-NEXT: popl %eax +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: fcmp_select_ogt: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmaxss 
%xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; +; X87-LABEL: fcmp_select_ogt: +; X87: # %bb.0: +; X87-NEXT: flds {{[0-9]+}}(%esp) +; X87-NEXT: flds {{[0-9]+}}(%esp) +; X87-NEXT: fcom %st(1) +; X87-NEXT: wait +; X87-NEXT: fnstsw %ax +; X87-NEXT: # kill: def $ah killed $ah killed $ax +; X87-NEXT: sahf +; X87-NEXT: ja .LBB57_2 +; X87-NEXT: # %bb.1: +; X87-NEXT: fstp %st(0) +; X87-NEXT: fldz +; X87-NEXT: fxch %st(1) +; X87-NEXT: .LBB57_2: +; X87-NEXT: fstp %st(1) +; X87-NEXT: wait +; X87-NEXT: retl +; +; X87-CMOV-LABEL: fcmp_select_ogt: +; X87-CMOV: # %bb.0: +; X87-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; X87-CMOV-NEXT: flds {{[0-9]+}}(%esp) +; X87-CMOV-NEXT: fcomi %st(1), %st +; X87-CMOV-NEXT: fxch %st(1) +; X87-CMOV-NEXT: fcmovnbe %st(1), %st +; X87-CMOV-NEXT: fstp %st(1) +; X87-CMOV-NEXT: wait +; X87-CMOV-NEXT: retl + %cond = call i1 @llvm.experimental.constrained.fcmps.f32( + float %f1, float %f2, metadata !"ogt", + metadata !"fpexcept.strict") + %res = select i1 %cond, float %f1, float %f2 + ret float %res +} + +define double @fcmp_select_ule(double %f1, double %f2) #0 { +; SSE-32-LABEL: fcmp_select_ule: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: andl $-8, %esp +; SSE-32-NEXT: subl $8, %esp +; SSE-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-32-NEXT: minsd 8(%ebp), %xmm0 +; SSE-32-NEXT: movsd %xmm0, (%esp) +; SSE-32-NEXT: fldl (%esp) +; SSE-32-NEXT: wait +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: fcmp_select_ule: +; SSE-64: # %bb.0: +; SSE-64-NEXT: minsd %xmm0, %xmm1 +; SSE-64-NEXT: movapd %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: fcmp_select_ule: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $8, %esp +; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-32-NEXT: vminsd 8(%ebp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovsd %xmm0, (%esp) +; AVX-32-NEXT: fldl (%esp) +; 
AVX-32-NEXT: wait +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: fcmp_select_ule: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vminsd %xmm0, %xmm1, %xmm0 +; AVX-64-NEXT: retq +; +; X87-LABEL: fcmp_select_ule: +; X87: # %bb.0: +; X87-NEXT: fldl {{[0-9]+}}(%esp) +; X87-NEXT: fldl {{[0-9]+}}(%esp) +; X87-NEXT: fcom %st(1) +; X87-NEXT: wait +; X87-NEXT: fnstsw %ax +; X87-NEXT: # kill: def $ah killed $ah killed $ax +; X87-NEXT: sahf +; X87-NEXT: jbe .LBB58_2 +; X87-NEXT: # %bb.1: +; X87-NEXT: fstp %st(0) +; X87-NEXT: fldz +; X87-NEXT: fxch %st(1) +; X87-NEXT: .LBB58_2: +; X87-NEXT: fstp %st(1) +; X87-NEXT: wait +; X87-NEXT: retl +; +; X87-CMOV-LABEL: fcmp_select_ule: +; X87-CMOV: # %bb.0: +; X87-CMOV-NEXT: fldl {{[0-9]+}}(%esp) +; X87-CMOV-NEXT: fldl {{[0-9]+}}(%esp) +; X87-CMOV-NEXT: fcomi %st(1), %st +; X87-CMOV-NEXT: fxch %st(1) +; X87-CMOV-NEXT: fcmovbe %st(1), %st +; X87-CMOV-NEXT: fstp %st(1) +; X87-CMOV-NEXT: wait +; X87-CMOV-NEXT: retl + %cond = call i1 @llvm.experimental.constrained.fcmps.f64( + double %f1, double %f2, metadata !"ule", + metadata !"fpexcept.strict") + %res = select i1 %cond, double %f1, double %f2 + ret double %res +} + +attributes #0 = { nounwind strictfp } declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata) declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata) From 0bc02b999a9686ba240b7a68d3f1cbbf037d2170 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Fri, 11 Oct 2024 10:31:27 +0800 Subject: [PATCH 112/177] [Clang] Instantiate Typedefs referenced by type alias deduction guides (#111804) TypedefNameDecl referenced by a synthesized CTAD guide for type aliases was not transformed previously, resulting in a substitution failure in BuildDeductionGuideForTypeAlias() when substituting into the right-hand-side deduction guide. This patch fixes it in the way we have been doing since https://reviews.llvm.org/D80743. 
We transform all the function parameters, parenting referenced TypedefNameDecls with the CXXDeductionGuideDecl. Then we instantiate these declarations in FindInstantiatedDecl() as we build up the eventual deduction guide, using the mechanism introduced in D80743 Fixes #111508 --- clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 21 ++++++++++++++++--- clang/test/SemaCXX/cxx20-ctad-type-alias.cpp | 13 ++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index 545da21183c3c4..2d3e58548fb7ac 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -70,8 +70,8 @@ class ExtractTypeForDeductionGuide ExtractTypeForDeductionGuide( Sema &SemaRef, llvm::SmallVectorImpl &MaterializedTypedefs, - ClassTemplateDecl *NestedPattern, - const MultiLevelTemplateArgumentList *OuterInstantiationArgs) + ClassTemplateDecl *NestedPattern = nullptr, + const MultiLevelTemplateArgumentList *OuterInstantiationArgs = nullptr) : Base(SemaRef), MaterializedTypedefs(MaterializedTypedefs), NestedPattern(NestedPattern), OuterInstantiationArgs(OuterInstantiationArgs) { @@ -1228,10 +1228,25 @@ FunctionTemplateDecl *DeclareAggregateDeductionGuideForTypeAlias( getRHSTemplateDeclAndArgs(SemaRef, AliasTemplate).first; if (!RHSTemplate) return nullptr; + + llvm::SmallVector TypedefDecls; + llvm::SmallVector NewParamTypes; + ExtractTypeForDeductionGuide TypeAliasTransformer(SemaRef, TypedefDecls); + for (QualType P : ParamTypes) { + QualType Type = TypeAliasTransformer.TransformType(P); + if (Type.isNull()) + return nullptr; + NewParamTypes.push_back(Type); + } + auto *RHSDeductionGuide = SemaRef.DeclareAggregateDeductionGuideFromInitList( - RHSTemplate, ParamTypes, Loc); + RHSTemplate, NewParamTypes, Loc); if (!RHSDeductionGuide) return nullptr; + + for (TypedefNameDecl *TD : TypedefDecls) + 
TD->setDeclContext(RHSDeductionGuide->getTemplatedDecl()); + return BuildDeductionGuideForTypeAlias(SemaRef, AliasTemplate, RHSDeductionGuide, Loc); } diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp index 5392573fcdb9d5..675c32a81f1ae8 100644 --- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp +++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp @@ -481,3 +481,16 @@ struct Out { Out::B out(100); // deduced to Out::A; static_assert(__is_same(decltype(out), Out::A)); } + +namespace GH111508 { + +template struct S { + using T = V; + T Data; +}; + +template using Alias = S; + +Alias A(42); + +} // namespace GH111508 From ec3e0a5900894c82e1763aa8597f47111edf6246 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 11 Oct 2024 11:08:07 +0800 Subject: [PATCH 113/177] Revert "[CodeGenPrepare] Convert `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1`" (#111932) Reverts llvm/llvm-project#111284 to fix clang stage2 builds. Investigating... Failed buildbots: https://lab.llvm.org/buildbot/#/builders/76/builds/3576 https://lab.llvm.org/buildbot/#/builders/168/builds/4308 https://lab.llvm.org/buildbot/#/builders/127/builds/1087 --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 28 -------- llvm/test/CodeGen/AArch64/arm64-popcnt.ll | 68 ++----------------- llvm/test/CodeGen/RISCV/rv32zbb.ll | 39 ----------- llvm/test/CodeGen/RISCV/rv64zbb.ll | 81 ----------------------- llvm/test/CodeGen/X86/ispow2.ll | 45 +------------ llvm/test/CodeGen/X86/known-never-zero.ll | 12 ++-- 6 files changed, 15 insertions(+), 258 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 86f28293ba9ff8..3e09fbad6ab198 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2111,31 +2111,6 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) { return false; } -/// Some targets have better codegen for `ctpop(X) u< 2` than `ctpop(X) == 1`. 
-/// This function converts `ctpop(X) ==/!= 1` into `ctpop(X) u 2/1` if the -/// result cannot be zero. -static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI, - const TargetTransformInfo &TTI, - const DataLayout &DL) { - ICmpInst::Predicate Pred; - if (!match(Cmp, m_ICmp(Pred, m_Intrinsic(), m_One()))) - return false; - if (!ICmpInst::isEquality(Pred)) - return false; - auto *II = cast(Cmp->getOperand(0)); - - if (isKnownNonZero(II, DL)) { - if (Pred == ICmpInst::ICMP_EQ) { - Cmp->setOperand(1, ConstantInt::get(II->getType(), 2)); - Cmp->setPredicate(ICmpInst::ICMP_ULT); - } else { - Cmp->setPredicate(ICmpInst::ICMP_UGT); - } - return true; - } - return false; -} - bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (sinkCmpExpression(Cmp, *TLI)) return true; @@ -2155,9 +2130,6 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (foldFCmpToFPClassTest(Cmp, *TLI, *DL)) return true; - if (adjustIsPower2Test(Cmp, *TLI, *TTI, *DL)) - return true; - return false; } diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll index 0030e9ce80abb4..f5ce73a366125b 100644 --- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll @@ -15,7 +15,7 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone { ; CHECK-NONEON-LABEL: cnt32_advsimd: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101 +; CHECK-NONEON-NEXT: mov w8, #16843009 ; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 ; CHECK-NONEON-NEXT: sub w9, w0, w9 ; CHECK-NONEON-NEXT: lsr w10, w9, #2 @@ -50,7 +50,7 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) { ; CHECK-NONEON-LABEL: cnt32_advsimd_2: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101 +; CHECK-NONEON-NEXT: mov w8, #16843009 ; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 ; CHECK-NONEON-NEXT: sub w9, w0, w9 
; CHECK-NONEON-NEXT: lsr w10, w9, #2 @@ -86,7 +86,7 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone { ; CHECK-NONEON-LABEL: cnt64_advsimd: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr x9, x0, #1 -; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 ; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 ; CHECK-NONEON-NEXT: sub x9, x0, x9 ; CHECK-NONEON-NEXT: lsr x10, x9, #2 @@ -114,7 +114,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat { ; CHECK-LABEL: cnt32: ; CHECK: // %bb.0: ; CHECK-NEXT: lsr w9, w0, #1 -; CHECK-NEXT: mov w8, #16843009 // =0x1010101 +; CHECK-NEXT: mov w8, #16843009 ; CHECK-NEXT: and w9, w9, #0x55555555 ; CHECK-NEXT: sub w9, w0, w9 ; CHECK-NEXT: lsr w10, w9, #2 @@ -130,7 +130,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat { ; CHECK-NONEON-LABEL: cnt32: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr w9, w0, #1 -; CHECK-NONEON-NEXT: mov w8, #16843009 // =0x1010101 +; CHECK-NONEON-NEXT: mov w8, #16843009 ; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 ; CHECK-NONEON-NEXT: sub w9, w0, w9 ; CHECK-NONEON-NEXT: lsr w10, w9, #2 @@ -155,7 +155,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { ; CHECK-LABEL: cnt64: ; CHECK: // %bb.0: ; CHECK-NEXT: lsr x9, x0, #1 -; CHECK-NEXT: mov x8, #72340172838076673 // =0x101010101010101 +; CHECK-NEXT: mov x8, #72340172838076673 ; CHECK-NEXT: and x9, x9, #0x5555555555555555 ; CHECK-NEXT: sub x9, x0, x9 ; CHECK-NEXT: lsr x10, x9, #2 @@ -171,7 +171,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { ; CHECK-NONEON-LABEL: cnt64: ; CHECK-NONEON: // %bb.0: ; CHECK-NONEON-NEXT: lsr x9, x0, #1 -; CHECK-NONEON-NEXT: mov x8, #72340172838076673 // =0x101010101010101 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 ; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 ; CHECK-NONEON-NEXT: sub x9, x0, x9 ; CHECK-NONEON-NEXT: lsr x10, x9, #2 @@ -278,59 +278,5 @@ define i1 
@ctpop32_ne_one(i32 %x) nounwind readnone { ret i1 %cmp } -define i1 @ctpop32_eq_one_nonzero(i32 %x) { -; CHECK-LABEL: ctpop32_eq_one_nonzero: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub w8, w0, #1 -; CHECK-NEXT: tst w0, w8 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: ret -; -; CHECK-NONEON-LABEL: ctpop32_eq_one_nonzero: -; CHECK-NONEON: // %bb.0: // %entry -; CHECK-NONEON-NEXT: sub w8, w0, #1 -; CHECK-NONEON-NEXT: tst w0, w8 -; CHECK-NONEON-NEXT: cset w0, eq -; CHECK-NONEON-NEXT: ret -; -; CHECK-CSSC-LABEL: ctpop32_eq_one_nonzero: -; CHECK-CSSC: // %bb.0: // %entry -; CHECK-CSSC-NEXT: sub w8, w0, #1 -; CHECK-CSSC-NEXT: tst w0, w8 -; CHECK-CSSC-NEXT: cset w0, eq -; CHECK-CSSC-NEXT: ret -entry: - %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp eq i32 %popcnt, 1 - ret i1 %cmp -} - -define i1 @ctpop32_ne_one_nonzero(i32 %x) { -; CHECK-LABEL: ctpop32_ne_one_nonzero: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub w8, w0, #1 -; CHECK-NEXT: tst w0, w8 -; CHECK-NEXT: cset w0, ne -; CHECK-NEXT: ret -; -; CHECK-NONEON-LABEL: ctpop32_ne_one_nonzero: -; CHECK-NONEON: // %bb.0: // %entry -; CHECK-NONEON-NEXT: sub w8, w0, #1 -; CHECK-NONEON-NEXT: tst w0, w8 -; CHECK-NONEON-NEXT: cset w0, ne -; CHECK-NONEON-NEXT: ret -; -; CHECK-CSSC-LABEL: ctpop32_ne_one_nonzero: -; CHECK-CSSC: // %bb.0: // %entry -; CHECK-CSSC-NEXT: sub w8, w0, #1 -; CHECK-CSSC-NEXT: tst w0, w8 -; CHECK-CSSC-NEXT: cset w0, ne -; CHECK-CSSC-NEXT: ret -entry: - %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp ne i32 %popcnt, 1 - ret i1 %cmp -} - declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i64 @llvm.ctpop.i64(i64) nounwind readnone diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 4c52047b928f4d..e24b1b41645cdf 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -1441,42 +1441,3 @@ define i32 @srai_slli2(i16 signext %0) { %3 = sext i16 %sext to i32 ret i32 %3 } 
- -define i1 @ctpop32_eq_one_nonzero(i32 %x) { -; RV32I-LABEL: ctpop32_eq_one_nonzero: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: addi a1, a0, -1 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: seqz a0, a0 -; RV32I-NEXT: ret -; -; RV32ZBB-LABEL: ctpop32_eq_one_nonzero: -; RV32ZBB: # %bb.0: # %entry -; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: sltiu a0, a0, 2 -; RV32ZBB-NEXT: ret -entry: - %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp eq i32 %popcnt, 1 - ret i1 %cmp -} - -define i1 @ctpop32_ne_one_nonzero(i32 %x) { -; RV32I-LABEL: ctpop32_ne_one_nonzero: -; RV32I: # %bb.0: # %entry -; RV32I-NEXT: addi a1, a0, -1 -; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: snez a0, a0 -; RV32I-NEXT: ret -; -; RV32ZBB-LABEL: ctpop32_ne_one_nonzero: -; RV32ZBB: # %bb.0: # %entry -; RV32ZBB-NEXT: cpop a0, a0 -; RV32ZBB-NEXT: sltiu a0, a0, 2 -; RV32ZBB-NEXT: xori a0, a0, 1 -; RV32ZBB-NEXT: ret -entry: - %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp ne i32 %popcnt, 1 - ret i1 %cmp -} diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index 1e7814d588e4c0..43a499806ab5ae 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -1618,84 +1618,3 @@ entry: %5 = add nsw i32 %4, %0 ret i32 %5 } - -define i1 @ctpop32_eq_one_nonzero(i32 %x) { -; RV64I-LABEL: ctpop32_eq_one_nonzero: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi a1, a0, -1 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: seqz a0, a0 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop32_eq_one_nonzero: -; RV64ZBB: # %bb.0: # %entry -; RV64ZBB-NEXT: cpopw a0, a0 -; RV64ZBB-NEXT: sltiu a0, a0, 2 -; RV64ZBB-NEXT: ret -entry: - %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp eq i32 %popcnt, 1 - ret i1 %cmp -} - -define i1 @ctpop32_ne_one_nonzero(i32 %x) { -; RV64I-LABEL: ctpop32_ne_one_nonzero: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi a1, a0, -1 -; 
RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: snez a0, a0 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop32_ne_one_nonzero: -; RV64ZBB: # %bb.0: # %entry -; RV64ZBB-NEXT: cpopw a0, a0 -; RV64ZBB-NEXT: sltiu a0, a0, 2 -; RV64ZBB-NEXT: xori a0, a0, 1 -; RV64ZBB-NEXT: ret -entry: - %popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp ne i32 %popcnt, 1 - ret i1 %cmp -} - -define i1 @ctpop64_eq_one_nonzero(i64 %x) { -; RV64I-LABEL: ctpop64_eq_one_nonzero: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi a1, a0, -1 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: seqz a0, a0 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop64_eq_one_nonzero: -; RV64ZBB: # %bb.0: # %entry -; RV64ZBB-NEXT: cpop a0, a0 -; RV64ZBB-NEXT: sltiu a0, a0, 2 -; RV64ZBB-NEXT: ret -entry: - %popcnt = call range(i64 1, 65) i64 @llvm.ctpop.i64(i64 %x) - %cmp = icmp eq i64 %popcnt, 1 - ret i1 %cmp -} - -define i1 @ctpop32_eq_one_maybezero(i32 %x) { -; RV64I-LABEL: ctpop32_eq_one_maybezero: -; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addiw a1, a0, -1 -; RV64I-NEXT: xor a0, a0, a1 -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: sltu a0, a1, a0 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: ctpop32_eq_one_maybezero: -; RV64ZBB: # %bb.0: # %entry -; RV64ZBB-NEXT: cpopw a0, a0 -; RV64ZBB-NEXT: addi a0, a0, -1 -; RV64ZBB-NEXT: seqz a0, a0 -; RV64ZBB-NEXT: ret -entry: - %popcnt = call range(i32 0, 16) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp eq i32 %popcnt, 1 - ret i1 %cmp -} diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll index 649d257b28d762..8723432de8b6b0 100644 --- a/llvm/test/CodeGen/X86/ispow2.ll +++ b/llvm/test/CodeGen/X86/ispow2.ll @@ -102,7 +102,7 @@ define <4 x i1> @is_pow2_non_zero_4xv64(<4 x i64> %xin) { ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 +; 
CHECK-AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 ; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-AVX512-NEXT: vzeroupper @@ -155,7 +155,7 @@ define <4 x i1> @neither_pow2_non_zero_4xv64(<4 x i64> %xin) { ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpopcntq %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 +; CHECK-AVX512-NEXT: vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 ; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-AVX512-NEXT: vzeroupper @@ -220,44 +220,3 @@ define <4 x i1> @neither_pow2_non_zero_4xv64_x_maybe_z(<4 x i64> %x) { %r = icmp ne <4 x i64> %cnt, ret <4 x i1> %r } - - -define i1 @ctpop32_eq_one_nonzero(i32 %x) { -; CHECK-NOBMI-LABEL: ctpop32_eq_one_nonzero: -; CHECK-NOBMI: # %bb.0: # %entry -; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax -; CHECK-NOBMI-NEXT: testl %eax, %edi -; CHECK-NOBMI-NEXT: sete %al -; CHECK-NOBMI-NEXT: retq -; -; CHECK-BMI2-LABEL: ctpop32_eq_one_nonzero: -; CHECK-BMI2: # %bb.0: # %entry -; CHECK-BMI2-NEXT: blsrl %edi, %eax -; CHECK-BMI2-NEXT: sete %al -; CHECK-BMI2-NEXT: retq -entry: - %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp eq i32 %popcnt, 1 - ret i1 %cmp -} - -define i1 @ctpop32_ne_one_nonzero(i32 %x) { -; CHECK-NOBMI-LABEL: ctpop32_ne_one_nonzero: -; CHECK-NOBMI: # %bb.0: # %entry -; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NOBMI-NEXT: leal -1(%rdi), %eax -; CHECK-NOBMI-NEXT: testl %eax, %edi -; CHECK-NOBMI-NEXT: setne %al -; CHECK-NOBMI-NEXT: retq -; -; CHECK-BMI2-LABEL: ctpop32_ne_one_nonzero: -; CHECK-BMI2: # %bb.0: # %entry -; CHECK-BMI2-NEXT: blsrl %edi, %eax -; CHECK-BMI2-NEXT: setne %al -; CHECK-BMI2-NEXT: retq -entry: - 
%popcnt = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) - %cmp = icmp ne i32 %popcnt, 1 - ret i1 %cmp -} diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll index 6c0aaeb451e14a..ac41a3fe6bb7e4 100644 --- a/llvm/test/CodeGen/X86/known-never-zero.ll +++ b/llvm/test/CodeGen/X86/known-never-zero.ll @@ -555,9 +555,9 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) { ; X86-NEXT: por %xmm2, %xmm0 ; X86-NEXT: pcmpeqd %xmm1, %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: pand %xmm1, %xmm0 -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pxor %xmm1, %xmm0 +; X86-NEXT: pcmpgtd %xmm1, %xmm0 ; X86-NEXT: psrld $31, %xmm0 ; X86-NEXT: retl ; @@ -566,10 +566,10 @@ define <4 x i32> @smax_known_zero_vec(<4 x i32> %x, <4 x i32> %y) { ; X64-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpsrld $31, %xmm0, %xmm0 +; X64-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: retq %z = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> ) %r = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %z) From 126ed16525c92af1025a86b582c087d213b47145 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 10 Oct 2024 20:30:04 -0700 Subject: [PATCH 114/177] [ARM] Fix formatting (NFC) I'm about to post a PR in this area. 
--- llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index b042ee69edd26c..b151a0116a9c41 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2531,7 +2531,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { bool RetVal = false; - DenseMap MI2LocMap; + DenseMap MI2LocMap; using MapIt = DenseMap>::iterator; using Base2InstMap = DenseMap>; using BaseVec = SmallVector; @@ -2570,7 +2570,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { Register Base = MI.getOperand(1).getReg(); int Offset = getMemoryOpOffset(MI); bool StopHere = false; - auto FindBases = [&] (Base2InstMap &Base2Ops, BaseVec &Bases) { + auto FindBases = [&](Base2InstMap &Base2Ops, BaseVec &Bases) { MapIt BI = Base2Ops.find(Base); if (BI == Base2Ops.end()) { Base2Ops[Base].push_back(&MI); From 51e9430a0c767243411d4b81c284700f89719277 Mon Sep 17 00:00:00 2001 From: lntue Date: Thu, 10 Oct 2024 23:33:02 -0400 Subject: [PATCH 115/177] [libc][math] Improve performance of double precision trig functions. (#111793) - Improve the accuracy of fast pass' range reduction. - Provide tighter error estimations. - Reduce the table size when `LIBC_MATH_SMALL_TABLES` flag is set. 
--- libc/src/__support/FPUtil/double_double.h | 43 ++- libc/src/__support/macros/optimization.h | 10 + libc/src/math/generic/cos.cpp | 117 +++--- libc/src/math/generic/pow.cpp | 2 +- .../generic/range_reduction_double_common.h | 348 ++++++++++++------ .../math/generic/range_reduction_double_fma.h | 254 +++---------- .../generic/range_reduction_double_nofma.h | 253 +++---------- libc/src/math/generic/sin.cpp | 129 +++---- libc/src/math/generic/sincos.cpp | 155 ++++---- libc/src/math/generic/sincos_eval.h | 27 +- libc/src/math/generic/tan.cpp | 147 ++++---- libc/test/src/math/cos_test.cpp | 3 +- libc/test/src/math/sin_test.cpp | 12 +- libc/test/src/math/tan_test.cpp | 21 +- 14 files changed, 666 insertions(+), 855 deletions(-) diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h index 25a4ee03387c67..db3c2c8a3d7a6e 100644 --- a/libc/src/__support/FPUtil/double_double.h +++ b/libc/src/__support/FPUtil/double_double.h @@ -18,6 +18,8 @@ namespace LIBC_NAMESPACE_DECL { namespace fputil { +#define DEFAULT_DOUBLE_SPLIT 27 + using DoubleDouble = LIBC_NAMESPACE::NumberPair; // The output of Dekker's FastTwoSum algorithm is correct, i.e.: @@ -61,7 +63,8 @@ LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) { // Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed // Roundings," https://inria.hal.science/hal-04480440. // Default splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1. -template LIBC_INLINE constexpr DoubleDouble split(double a) { +template +LIBC_INLINE constexpr DoubleDouble split(double a) { DoubleDouble r{0.0, 0.0}; // CN = 2^N. constexpr double CN = static_cast(1 << N); @@ -73,6 +76,22 @@ template LIBC_INLINE constexpr DoubleDouble split(double a) { return r; } +// Helper for non-fma exact mult where the first number is already split. 
+template +LIBC_INLINE DoubleDouble exact_mult(const DoubleDouble &as, double a, + double b) { + DoubleDouble bs = split(b); + DoubleDouble r{0.0, 0.0}; + + r.hi = a * b; + double t1 = as.hi * bs.hi - r.hi; + double t2 = as.hi * bs.lo + t1; + double t3 = as.lo * bs.hi + t2; + r.lo = as.lo * bs.lo + t3; + + return r; +} + // Note: When FMA instruction is not available, the `exact_mult` function is // only correct for round-to-nearest mode. See: // Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed @@ -80,7 +99,7 @@ template LIBC_INLINE constexpr DoubleDouble split(double a) { // Using Theorem 1 in the paper above, without FMA instruction, if we restrict // the generated constants to precision <= 51, and splitting it by 2^28 + 1, // then a * b = r.hi + r.lo is exact for all rounding modes. -template +template LIBC_INLINE DoubleDouble exact_mult(double a, double b) { DoubleDouble r{0.0, 0.0}; @@ -90,18 +109,8 @@ LIBC_INLINE DoubleDouble exact_mult(double a, double b) { #else // Dekker's Product. 
DoubleDouble as = split(a); - DoubleDouble bs; - if constexpr (NO_FMA_ALL_ROUNDINGS) - bs = split<28>(b); - else - bs = split(b); - - r.hi = a * b; - double t1 = as.hi * bs.hi - r.hi; - double t2 = as.hi * bs.lo + t1; - double t3 = as.lo * bs.hi + t2; - r.lo = as.lo * bs.lo + t3; + r = exact_mult(as, a, b); #endif // LIBC_TARGET_CPU_HAS_FMA return r; @@ -113,10 +122,10 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) { return r; } -template +template LIBC_INLINE DoubleDouble quick_mult(const DoubleDouble &a, const DoubleDouble &b) { - DoubleDouble r = exact_mult(a.hi, b.hi); + DoubleDouble r = exact_mult(a.hi, b.hi); double t1 = multiply_add(a.hi, b.lo, r.lo); double t2 = multiply_add(a.lo, b.hi, t1); r.lo = t2; @@ -157,8 +166,8 @@ LIBC_INLINE DoubleDouble div(const DoubleDouble &a, const DoubleDouble &b) { double e_hi = fputil::multiply_add(b.hi, -r.hi, a.hi); double e_lo = fputil::multiply_add(b.lo, -r.hi, a.lo); #else - DoubleDouble b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi); - DoubleDouble b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi); + DoubleDouble b_hi_r_hi = fputil::exact_mult(b.hi, -r.hi); + DoubleDouble b_lo_r_hi = fputil::exact_mult(b.lo, -r.hi); double e_hi = (a.hi + b_hi_r_hi.hi) + b_hi_r_hi.lo; double e_lo = (a.lo + b_lo_r_hi.hi) + b_lo_r_hi.lo; #endif // LIBC_TARGET_CPU_HAS_FMA diff --git a/libc/src/__support/macros/optimization.h b/libc/src/__support/macros/optimization.h index 5ffd474d35c54d..41ecd2bd6d7191 100644 --- a/libc/src/__support/macros/optimization.h +++ b/libc/src/__support/macros/optimization.h @@ -48,6 +48,16 @@ LIBC_INLINE constexpr bool expects_bool_condition(T value, T expected) { #ifndef LIBC_MATH #define LIBC_MATH 0 +#else + +#if (LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) +#define LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#endif + +#if (LIBC_MATH & LIBC_MATH_SMALL_TABLES) +#define LIBC_MATH_HAS_SMALL_TABLES +#endif + #endif // LIBC_MATH #endif // LLVM_LIBC_SRC___SUPPORT_MACROS_OPTIMIZATION_H diff --git 
a/libc/src/math/generic/cos.cpp b/libc/src/math/generic/cos.cpp index e61d800ce2dada..923ea96852d889 100644 --- a/libc/src/math/generic/cos.cpp +++ b/libc/src/math/generic/cos.cpp @@ -17,17 +17,14 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/math/generic/range_reduction_double_common.h" #include "src/math/generic/sincos_eval.h" -// TODO: We might be able to improve the performance of large range reduction of -// non-FMA targets further by operating directly on 25-bit chunks of 128/pi and -// pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of -// those lookup table. -#include "range_reduction_double_common.h" - -#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) -#define LIBC_MATH_COS_SKIP_ACCURATE_PASS -#endif +#ifdef LIBC_TARGET_CPU_HAS_FMA +#include "range_reduction_double_fma.h" +#else +#include "range_reduction_double_nofma.h" +#endif // LIBC_TARGET_CPU_HAS_FMA namespace LIBC_NAMESPACE_DECL { @@ -42,22 +39,29 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { DoubleDouble y; unsigned k; - generic::LargeRangeReduction range_reduction_large{}; + LargeRangeReduction range_reduction_large{}; - // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA) + // |x| < 2^16. if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { - // |x| < 2^-27 - if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { - // Signed zeros. - if (LIBC_UNLIKELY(x == 0.0)) - return 1.0; - - // For |x| < 2^-27, |cos(x) - 1| < |x|^2/2 < 2^-54 = ulp(1 - 2^-53)/2. - return fputil::round_result_slightly_down(1.0); + // |x| < 2^-7 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) { + // |x| < 2^-27 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { + // Signed zeros. + if (LIBC_UNLIKELY(x == 0.0)) + return 1.0; + + // For |x| < 2^-27, |cos(x) - 1| < |x|^2/2 < 2^-54 = ulp(1 - 2^-53)/2. 
+ return fputil::round_result_slightly_down(1.0); + } + // No range reduction needed. + k = 0; + y.lo = 0.0; + y.hi = x; + } else { + // Small range reduction. + k = range_reduction_small(x, y); } - - // // Small range reduction. - k = range_reduction_small(x, y); } else { // Inf or NaN if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { @@ -70,70 +74,51 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { } // Large range reduction. - k = range_reduction_large.compute_high_part(x); - y = range_reduction_large.fast(); + k = range_reduction_large.fast(x, y); } DoubleDouble sin_y, cos_y; - generic::sincos_eval(y, sin_y, cos_y); + [[maybe_unused]] double err = generic::sincos_eval(y, sin_y, cos_y); // Look up sin(k * pi/128) and cos(k * pi/128) - // Memory saving versions: - - // Use 128-entry table instead: - // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127]; - // uint64_t sin_s = static_cast((k + 128) & 128) << (63 - 7); - // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127]; - // uint64_t cos_s = static_cast((k + 64) & 128) << (63 - 7); - // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - - // Use 64-entry table instead: - // auto get_idx_dd = [](unsigned kk) -> DoubleDouble { - // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - // DoubleDouble ans = SIN_K_PI_OVER_128[idx]; - // if (kk & 128) { - // ans.hi = -ans.hi; - // ans.lo = -ans.lo; - // } - // return ans; - // }; - // DoubleDouble sin_k = get_idx_dd(k + 128); - // DoubleDouble cos_k = get_idx_dd(k + 64); - +#ifdef LIBC_MATH_HAS_SMALL_TABLES + // Memory saving versions. Use 65-entry table. + auto get_idx_dd = [](unsigned kk) -> DoubleDouble { + unsigned idx = (kk & 64) ? 
64 - (kk & 63) : (kk & 63); + DoubleDouble ans = SIN_K_PI_OVER_128[idx]; + if (kk & 128) { + ans.hi = -ans.hi; + ans.lo = -ans.lo; + } + return ans; + }; + DoubleDouble sin_k = get_idx_dd(k + 128); + DoubleDouble cos_k = get_idx_dd(k + 64); +#else // Fast look up version, but needs 256-entry table. // -sin(k * pi/128) = sin((k + 128) * pi/128) // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). DoubleDouble msin_k = SIN_K_PI_OVER_128[(k + 128) & 255]; DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 255]; +#endif // LIBC_MATH_HAS_SMALL_TABLES // After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128). // So k is an integer and -pi / 256 <= y <= pi / 256. // Then cos(x) = cos((k * pi/128 + y) // = cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128) - DoubleDouble cos_k_cos_y = fputil::quick_mult(cos_y, cos_k); - DoubleDouble msin_k_sin_y = fputil::quick_mult(sin_y, msin_k); + DoubleDouble cos_k_cos_y = fputil::quick_mult(cos_y, cos_k); + DoubleDouble msin_k_sin_y = fputil::quick_mult(sin_y, msin_k); DoubleDouble rr = fputil::exact_add(cos_k_cos_y.hi, msin_k_sin_y.hi); rr.lo += msin_k_sin_y.lo + cos_k_cos_y.lo; -#ifdef LIBC_MATH_COS_SKIP_ACCURATE_PASS +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS return rr.hi + rr.lo; #else - // Accurate test and pass for correctly rounded implementation. -#ifdef LIBC_TARGET_CPU_HAS_FMA - constexpr double ERR = 0x1.0p-70; -#else - // TODO: Improve non-FMA fast pass accuracy. 
- constexpr double ERR = 0x1.0p-66; -#endif // LIBC_TARGET_CPU_HAS_FMA - - double rlp = rr.lo + ERR; - double rlm = rr.lo - ERR; + double rlp = rr.lo + err; + double rlm = rr.lo - err; double r_upper = rr.hi + rlp; // (rr.lo + ERR); double r_lower = rr.hi + rlm; // (rr.lo - ERR); @@ -144,7 +129,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { Float128 u_f128, sin_u, cos_u; if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) - u_f128 = generic::range_reduction_small_f128(x); + u_f128 = range_reduction_small_f128(x); else u_f128 = range_reduction_large.accurate(); @@ -152,7 +137,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { auto get_sin_k = [](unsigned kk) -> Float128 { unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - Float128 ans = generic::SIN_K_PI_OVER_128_F128[idx]; + Float128 ans = SIN_K_PI_OVER_128_F128[idx]; if (kk & 128) ans.sign = Sign::NEG; return ans; @@ -172,7 +157,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { // https://github.com/llvm/llvm-project/issues/96452. 
return static_cast(r); -#endif // !LIBC_MATH_COS_SKIP_ACCURATE_PASS +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/pow.cpp b/libc/src/math/generic/pow.cpp index 3a50e220154e51..181d3d40b3c9ad 100644 --- a/libc/src/math/generic/pow.cpp +++ b/libc/src/math/generic/pow.cpp @@ -398,7 +398,7 @@ LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) { #else double c = FPBits(m_x.uintval() & 0x3fff'e000'0000'0000).get_val(); dx = fputil::multiply_add(RD[idx_x], m_x.get_val() - c, CD[idx_x]); // Exact - dx_c0 = fputil::exact_mult(COEFFS[0], dx); + dx_c0 = fputil::exact_mult<28>(dx, COEFFS[0]); // Exact #endif // LIBC_TARGET_CPU_HAS_FMA double dx2 = dx * dx; diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h index 290b642be4c69f..e23bbff144bee8 100644 --- a/libc/src/math/generic/range_reduction_double_common.h +++ b/libc/src/math/generic/range_reduction_double_common.h @@ -17,150 +17,272 @@ #include "src/__support/common.h" #include "src/__support/integer_literals.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" -#ifdef LIBC_TARGET_CPU_HAS_FMA -#include "range_reduction_double_fma.h" - -// With FMA, we limit the maxmimum exponent to be 2^16, so that the error bound -// from the fma::range_reduction_small is bounded by 2^-88 instead of 2^-72. 
-#define FAST_PASS_EXPONENT 16 -using LIBC_NAMESPACE::fma::ONE_TWENTY_EIGHT_OVER_PI; -using LIBC_NAMESPACE::fma::range_reduction_small; -using LIBC_NAMESPACE::fma::SIN_K_PI_OVER_128; +namespace LIBC_NAMESPACE_DECL { -LIBC_INLINE constexpr bool NO_FMA = false; +#ifdef LIBC_TARGET_CPU_HAS_FMA +static constexpr unsigned SPLIT = DEFAULT_DOUBLE_SPLIT; #else -#include "range_reduction_double_nofma.h" +// When there is no-FMA instructions, in order to have exact product of 2 double +// precision with directional roundings, we need to lower the precision of the +// constants by at least 1 bit, and use a different splitting constant. +static constexpr unsigned SPLIT = 28; +#endif // LIBC_TARGET_CPU_HAS_FMA -using LIBC_NAMESPACE::nofma::FAST_PASS_EXPONENT; -using LIBC_NAMESPACE::nofma::ONE_TWENTY_EIGHT_OVER_PI; -using LIBC_NAMESPACE::nofma::range_reduction_small; -using LIBC_NAMESPACE::nofma::SIN_K_PI_OVER_128; +using LIBC_NAMESPACE::fputil::DoubleDouble; +using Float128 = LIBC_NAMESPACE::fputil::DyadicFloat<128>; -LIBC_INLINE constexpr bool NO_FMA = true; -#endif // LIBC_TARGET_CPU_HAS_FMA +#define FAST_PASS_EXPONENT 16 -namespace LIBC_NAMESPACE_DECL { +// For 2^-7 < |x| < 2^16, return k and u such that: +// k = round(x * 128/pi) +// x mod pi/128 = x - k * pi/128 ~ u.hi + u.lo +// Error bound: +// |(x - k * pi/128) - (u_hi + u_lo)| <= max(ulp(ulp(u_hi)), 2^-119) +// <= 2^-111. +LIBC_INLINE unsigned range_reduction_small(double x, DoubleDouble &u) { + // Values of -pi/128 used for inputs with absolute value <= 2^16. + // The first 3 parts are generated with (53 - 21 = 32)-bit precision, so that + // the product k * MPI_OVER_128[i] is exact. 
+ // Generated by Sollya with: + // > display = hexadecimal!; + // > a = round(pi/128, 32, RN); + // > b = round(pi/128 - a, 32, RN); + // > c = round(pi/128 - a - b, D, RN); + // > print(-a, ",", -b, ",", -c); + constexpr double MPI_OVER_128[3] = {-0x1.921fb544p-6, -0x1.0b4611a6p-40, + -0x1.3198a2e037073p-75}; + constexpr double ONE_TWENTY_EIGHT_OVER_PI_D = 0x1.45f306dc9c883p5; + double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI_D; + double kd = fputil::nearest_integer(prod_hi); -namespace generic { + // Let y = x - k * (pi/128) + // Then |y| < pi / 256 + // With extra rounding errors, we can bound |y| < 1.6 * 2^-7. + double y_hi = fputil::multiply_add(kd, MPI_OVER_128[0], x); // Exact + // |u.hi| < 1.6*2^-7 + u.hi = fputil::multiply_add(kd, MPI_OVER_128[1], y_hi); + double u0 = y_hi - u.hi; // Exact + // |u.lo| <= max(ulp(u.hi), |kd * MPI_OVER_128[2]|) + double u1 = fputil::multiply_add(kd, MPI_OVER_128[1], u0); // Exact + u.lo = fputil::multiply_add(kd, MPI_OVER_128[2], u1); + // Error bound: + // |x - k * pi/128| - (u.hi + u.lo) <= ulp(u.lo) + // <= ulp(max(ulp(u.hi), kd*MPI_OVER_128[2])) + // <= 2^(-7 - 104) = 2^-111. 
-using LIBC_NAMESPACE::fputil::DoubleDouble; -using Float128 = LIBC_NAMESPACE::fputil::DyadicFloat<128>; + return static_cast(static_cast(kd)); +} -LIBC_INLINE constexpr Float128 PI_OVER_128_F128 = { - Sign::POS, -133, 0xc90f'daa2'2168'c234'c4c6'628b'80dc'1cd1_u128}; +// Digits of 2^(16*i) / pi, generated by Sollya with: +// > procedure ulp(x, n) { return 2^(floor(log2(abs(x))) - n); }; +// > for i from 0 to 63 do { +// if i < 3 then { pi_inv = 0.25 + 2^(16*(i - 3)) / pi; } +// else { pi_inv = 2^(16*(i-3)) / pi; }; +// pn = nearestint(pi_inv); +// pi_frac = pi_inv - pn; +// a = round(pi_frac, 51, RN); +// b = round(pi_frac - a, 51, RN); +// c = round(pi_frac - a - b, 51, RN); +// d = round(pi_frac - a - b - c, D, RN); +// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},"); +// }; +// +// Notice that for [0..2] the leading bit of 2^(16*(i - 3)) / pi is very small, +// so we add 0.25 so that the conditions for the algorithms are still satisfied, +// and one of those conditions guarantees that ulp(0.25 * x_reduced) >= 2, and +// will safely be discarded. -// Note: The look-up tables ONE_TWENTY_EIGHT_OVER_PI is selected to be either -// from fma:: or nofma:: namespace. 
+static constexpr double ONE_TWENTY_EIGHT_OVER_PI[64][4] = { + {0x1.0000000000014p5, 0x1.7cc1b727220a8p-49, 0x1.4fe13abe8fa9cp-101, + -0x1.911f924eb5336p-153}, + {0x1.0000000145f3p5, 0x1.b727220a94fep-49, 0x1.3abe8fa9a6eep-101, + 0x1.b6c52b3278872p-155}, + {0x1.000145f306dc8p5, 0x1.c882a53f84ebp-47, -0x1.70565911f925p-101, + 0x1.4acc9e21c821p-153}, + {0x1.45f306dc9c884p5, -0x1.5ac07b1505c14p-47, -0x1.96447e493ad4cp-99, + -0x1.b0ef1bef806bap-152}, + {-0x1.f246c6efab58p4, -0x1.ec5417056591p-49, -0x1.f924eb53361ep-101, + 0x1.c820ff28b1d5fp-153}, + {0x1.391054a7f09d4p4, 0x1.f47d4d377036cp-48, 0x1.8a5664f10e41p-100, + 0x1.fe5163abdebbcp-154}, + {0x1.529fc2757d1f4p2, 0x1.34ddc0db62958p-50, 0x1.93c439041fe5p-102, + 0x1.63abdebbc561bp-154}, + {-0x1.ec5417056591p-1, -0x1.f924eb53361ep-53, 0x1.c820ff28b1d6p-105, + -0x1.0a21d4f246dc9p-157}, + {-0x1.505c1596447e4p5, -0x1.275a99b0ef1cp-48, 0x1.07f9458eaf7bp-100, + -0x1.0ea79236e4717p-152}, + {-0x1.596447e493ad4p1, -0x1.9b0ef1bef806cp-52, 0x1.63abdebbc561cp-106, + -0x1.1b7238b7b645ap-159}, + {0x1.bb81b6c52b328p5, -0x1.de37df00d74e4p-49, 0x1.5ef5de2b0db94p-101, + -0x1.c8e2ded9169p-153}, + {0x1.b6c52b3278874p5, -0x1.f7c035d38a844p-47, 0x1.778ac36e48dc8p-99, + -0x1.6f6c8b47fe6dbp-152}, + {0x1.2b3278872084p5, -0x1.ae9c5421443a8p-50, -0x1.e48db91c5bdb4p-102, + 0x1.d2e006492eea1p-154}, + {-0x1.8778df7c035d4p5, 0x1.d5ef5de2b0db8p-49, 0x1.2371d2126e97p-101, + 0x1.924bba8274648p-160}, + {-0x1.bef806ba71508p4, -0x1.443a9e48db91cp-50, -0x1.6f6c8b47fe6dcp-104, + 0x1.77504e8c90e7fp-157}, + {-0x1.ae9c5421443a8p-2, -0x1.e48db91c5bdb4p-54, 0x1.d2e006492eeap-106, + 0x1.3a32439fc3bd6p-159}, + {-0x1.38a84288753c8p5, -0x1.1b7238b7b645cp-47, 0x1.c00c925dd413cp-99, + -0x1.cdbc603c429c7p-151}, + {-0x1.0a21d4f246dc8p3, -0x1.c5bdb22d1ff9cp-50, 0x1.25dd413a32438p-103, + 0x1.fc3bd63962535p-155}, + {-0x1.d4f246dc8e2ep3, 0x1.26e9700324978p-49, -0x1.5f62e6de301e4p-102, + 0x1.eb1cb129a73efp-154}, + {-0x1.236e4716f6c8cp4, 0x1.700324977505p-49, 
-0x1.736f180f10a7p-101, + -0x1.a76b2c608bbeep-153}, + {0x1.b8e909374b8p4, 0x1.924bba8274648p-48, 0x1.cfe1deb1cb128p-102, + 0x1.a73ee88235f53p-154}, + {0x1.09374b801924cp4, -0x1.15f62e6de302p-50, 0x1.deb1cb129a74p-102, + -0x1.177dca0ad144cp-154}, + {-0x1.68ffcdb688afcp3, 0x1.d1921cfe1debp-50, 0x1.cb129a73ee884p-102, + -0x1.ca0ad144bb7b1p-154}, + {0x1.924bba8274648p0, 0x1.cfe1deb1cb128p-54, 0x1.a73ee88235f54p-106, + -0x1.144bb7b16639p-158}, + {-0x1.a22bec5cdbc6p5, -0x1.e214e34ed658cp-50, -0x1.177dca0ad144cp-106, + 0x1.213a671c09ad1p-160}, + {0x1.3a32439fc3bd8p1, -0x1.c69dacb1822fp-51, 0x1.1afa975da2428p-105, + -0x1.6638fd94ba082p-158}, + {-0x1.b78c0788538d4p4, 0x1.29a73ee88236p-50, -0x1.5a28976f62cc8p-103, + 0x1.c09ad17df904ep-156}, + {0x1.fc3bd63962534p5, 0x1.cfba208d7d4bcp-48, -0x1.12edec598e3f8p-100, + 0x1.ad17df904e647p-152}, + {-0x1.4e34ed658c118p2, 0x1.046bea5d7689p-51, 0x1.3a671c09ad17cp-104, + 0x1.f904e64758e61p-156}, + {0x1.62534e7dd1048p5, -0x1.415a28976f62cp-47, -0x1.8e3f652e8207p-100, + 0x1.3991d63983534p-154}, + {-0x1.63045df7282b4p4, -0x1.44bb7b16638fcp-50, -0x1.94ba081bec67p-102, + 0x1.d639835339f4ap-154}, + {0x1.d1046bea5d768p5, 0x1.213a671c09adp-48, 0x1.7df904e64759p-100, + -0x1.9f2b3182d8defp-152}, + {0x1.afa975da24274p3, 0x1.9c7026b45f7e4p-50, 0x1.3991d63983534p-106, + -0x1.82d8dee81d108p-160}, + {-0x1.a28976f62cc7p5, -0x1.fb29741037d8cp-47, -0x1.b8a719f2b3184p-100, + 0x1.272117e2ef7e5p-152}, + {-0x1.76f62cc71fb28p5, -0x1.741037d8cdc54p-47, 0x1.cc1a99cfa4e44p-101, + -0x1.d03a21036be27p-153}, + {0x1.d338e04d68bfp5, -0x1.bec66e29c67ccp-50, 0x1.339f49c845f8cp-102, + -0x1.081b5f13801dap-156}, + {0x1.c09ad17df905p4, -0x1.9b8a719f2b318p-48, -0x1.6c6f740e8840cp-103, + -0x1.af89c00ed0004p-155}, + {0x1.68befc827323cp5, -0x1.38cf9598c16c8p-47, 0x1.08bf177bf2508p-99, + -0x1.3801da00087eap-152}, + {-0x1.037d8cdc538dp5, 0x1.a99cfa4e422fcp-49, 0x1.77bf250763ffp-103, + 0x1.2fffbc0b301fep-155}, + {-0x1.8cdc538cf9598p5, -0x1.82d8dee81d108p-48, -0x1.b5f13801dap-104, 
+ -0x1.0fd33f8086877p-157}, + {-0x1.4e33e566305bp3, -0x1.bdd03a21036cp-49, 0x1.d8ffc4bffef04p-101, + -0x1.33f80868773a5p-153}, + {-0x1.f2b3182d8dee8p4, -0x1.d1081b5f138p-52, -0x1.da00087e99fcp-104, + -0x1.0d0ee74a5f593p-158}, + {-0x1.8c16c6f740e88p5, -0x1.036be27003b4p-49, -0x1.0fd33f8086878p-109, + 0x1.8b5a0a6d1f6d3p-162}, + {0x1.3908bf177bf24p5, 0x1.0763ff12fffbcp-47, 0x1.6603fbcbc462cp-104, + 0x1.6829b47db4dap-156}, + {0x1.7e2ef7e4a0ec8p4, -0x1.da00087e99fcp-56, -0x1.0d0ee74a5f594p-110, + 0x1.1f6d367ecf27dp-162}, + {-0x1.081b5f13801dcp4, 0x1.fff7816603fbcp-48, 0x1.788c5ad05369p-101, + -0x1.25930261b069fp-155}, + {-0x1.af89c00ed0004p5, -0x1.fa67f010d0ee8p-50, 0x1.6b414da3eda6cp-103, + 0x1.fb3c9f2c26dd4p-156}, + {-0x1.c00ed00043f4cp5, -0x1.fc04343b9d298p-48, 0x1.4da3eda6cfdap-103, + -0x1.b069ec9161738p-155}, + {0x1.2fffbc0b301fcp5, 0x1.e5e2316b414dcp-47, -0x1.c125930261b08p-99, + 0x1.6136e9e8c7ecdp-151}, + {-0x1.0fd33f8086878p3, 0x1.8b5a0a6d1f6d4p-50, -0x1.30261b069ec9p-103, + -0x1.61738132c3403p-155}, + {-0x1.9fc04343b9d28p4, -0x1.7d64b824b2604p-48, -0x1.86c1a7b24585cp-101, + -0x1.c09961a015d29p-154}, + {-0x1.0d0ee74a5f594p2, 0x1.1f6d367ecf27cp-50, 0x1.6136e9e8c7eccp-103, + 0x1.3cbfd45aea4f7p-155}, + {-0x1.dce94beb25c14p5, 0x1.a6cfd9e4f9614p-47, -0x1.22c2e70265868p-100, + -0x1.5d28ad8453814p-158}, + {-0x1.4beb25c12593p5, -0x1.30d834f648b0cp-50, 0x1.8fd9a797fa8b4p-104, + 0x1.d49eeb1faf97cp-156}, + {0x1.b47db4d9fb3c8p4, 0x1.f2c26dd3d18fcp-48, 0x1.9a797fa8b5d48p-100, + 0x1.eeb1faf97c5edp-152}, + {-0x1.25930261b06ap5, 0x1.36e9e8c7ecd3cp-47, 0x1.7fa8b5d49eebp-100, + 0x1.faf97c5ecf41dp-152}, + {0x1.fb3c9f2c26dd4p4, -0x1.738132c3402bcp-51, 0x1.aea4f758fd7ccp-103, + -0x1.d0985f18c10ebp-159}, + {-0x1.b069ec9161738p5, -0x1.32c3402ba515cp-51, 0x1.eeb1faf97c5ecp-104, + 0x1.e839cfbc52949p-157}, + {-0x1.ec9161738132cp5, -0x1.a015d28ad8454p-50, 0x1.faf97c5ecf41cp-104, + 0x1.cfbc529497536p-157}, + {-0x1.61738132c3404p5, 0x1.45aea4f758fd8p-47, -0x1.a0e84c2f8c608p-102, + 
-0x1.d6b5b45650128p-156}, + {0x1.fb34f2ff516bcp3, -0x1.6c229c0a0d074p-49, -0x1.30be31821d6b4p-104, + -0x1.b4565012813b8p-156}, + {0x1.3cbfd45aea4f8p5, -0x1.4e050683a130cp-48, 0x1.ce7de294a4ba8p-104, + 0x1.afed7ec47e357p-156}, + {-0x1.5d28ad8453814p2, -0x1.a0e84c2f8c608p-54, -0x1.d6b5b45650128p-108, + -0x1.3b81ca8bdea7fp-164}, + {-0x1.15b08a702834p5, -0x1.d0985f18c10ecp-47, 0x1.4a4ba9afed7ecp-100, + 0x1.1f8d5d0856033p-154}, +}; -// For large range |x| >= 2^32, we use the exponent of x to find 3 double-chunks -// of 128/pi c_hi, c_mid, c_lo such that: -// 1) ulp(round(x * c_hi, D, RN)) >= 256, +// For large range |x| >= 2^16, we perform the range reduction computations as: +// u = x - k * pi/128 = (pi/128) * (x * (128/pi) - k). +// We use the exponent of x to find 4 double-chunks of 128/pi: +// c_hi, c_mid, c_lo, c_lo_2 such that: +// 1) ulp(round(x * c_hi, D, RN)) >= 2^8 = 256, // 2) If x * c_hi = ph_hi + ph_lo and x * c_mid = pm_hi + pm_lo, then // min(ulp(ph_lo), ulp(pm_hi)) >= 2^-53. -// 3) ulp(round(x * c_lo, D, RN)) <= 2^-7x. -// This will allow us to do quick computations as: -// (x * 256/pi) ~ x * (c_hi + c_mid + c_lo) (mod 256) -// ~ ph_lo + pm_hi + pm_lo + (x * c_lo) +// This will allow us to drop the high part ph_hi and the addition: +// (ph_lo + pm_hi) mod 1 +// can be exactly representable in a double precision. +// This will allow us to do split the computations as: +// (x * 256/pi) ~ x * (c_hi + c_mid + c_lo + c_lo_2) (mod 256) +// ~ (ph_lo + pm_hi) + (pm_lo + x * c_lo) + x * c_lo_2. // Then, // round(x * 128/pi) = round(ph_lo + pm_hi) (mod 256) // And the high part of fractional part of (x * 128/pi) can simply be: // {x * 128/pi}_hi = {ph_lo + pm_hi}. // To prevent overflow when x is very large, we simply scale up -// (c_hi, c_mid, c_lo) by a fixed power of 2 (based on the index) and scale down -// x by the same amount. - -template struct LargeRangeReduction { - // Calculate the high part of the range reduction exactly. 
- LIBC_INLINE unsigned compute_high_part(double x) { - using FPBits = typename fputil::FPBits; - FPBits xbits(x); - - // TODO: The extra exponent gap of 62 below can be reduced a bit for non-FMA - // with a more careful analysis, which in turn will reduce the error bound - // for non-FMA - int x_e_m62 = xbits.get_biased_exponent() - (FPBits::EXP_BIAS + 62); - idx = static_cast((x_e_m62 >> 4) + 3); - // Scale x down by 2^(-(16 * (idx - 3)) - xbits.set_biased_exponent((x_e_m62 & 15) + FPBits::EXP_BIAS + 62); - // 2^62 <= |x_reduced| < 2^(62 + 16) = 2^78 - x_reduced = xbits.get_val(); - // x * c_hi = ph.hi + ph.lo exactly. - DoubleDouble ph = - fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][0]); - // x * c_mid = pm.hi + pm.lo exactly. - DoubleDouble pm = - fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][1]); - // Extract integral parts and fractional parts of (ph.lo + pm.hi). - double kh = fputil::nearest_integer(ph.lo); - double ph_lo_frac = ph.lo - kh; // Exact - double km = fputil::nearest_integer(pm.hi + ph_lo_frac); - double pm_hi_frac = pm.hi - km; // Exact - // x * 128/pi mod 1 ~ y_hi + y_lo - y_hi = ph_lo_frac + pm_hi_frac; // Exact - pm_lo = pm.lo; - return static_cast(static_cast(kh) + - static_cast(km)); - } +// (c_hi, c_mid, c_lo, c_lo_2) by a fixed power of 2 (based on the index) and +// scale down x by the same amount. 
- LIBC_INLINE DoubleDouble fast() const { - // y_lo = x * c_lo + pm.lo - double y_lo = fputil::multiply_add(x_reduced, - ONE_TWENTY_EIGHT_OVER_PI[idx][2], pm_lo); - DoubleDouble y = fputil::exact_add(y_hi, y_lo); - - // Digits of pi/128, generated by Sollya with: - // > a = round(pi/128, D, RN); - // > b = round(pi/128 - a, D, RN); - constexpr DoubleDouble PI_OVER_128_DD = {0x1.1a62633145c07p-60, - 0x1.921fb54442d18p-6}; - - // Error bound: with {a} denote the fractional part of a, i.e.: - // {a} = a - round(a) - // Then, - // | {x * 128/pi} - (y_hi + y_lo) | < 2 * ulp(x_reduced * - // * ONE_TWENTY_EIGHT_OVER_PI[idx][2]) - // For FMA: - // | {x * 128/pi} - (y_hi + y_lo) | <= 2 * 2^77 * 2^-103 * 2^-52 - // = 2^-77. - // | {x mod pi/128} - (u.hi + u.lo) | < 2 * 2^-6 * 2^-77. - // = 2^-82. - // For non-FMA: - // | {x * 128/pi} - (y_hi + y_lo) | <= 2 * 2^77 * 2^-99 * 2^-52 - // = 2^-73. - // | {x mod pi/128} - (u.hi + u.lo) | < 2 * 2^-6 * 2^-73. - // = 2^-78. - return fputil::quick_mult(y, PI_OVER_128_DD); - } +struct LargeRangeReduction { + + // To be implemented in range_reduction_double_fma.h and + // range_reduction_double_nofma.h. 
+ unsigned fast(double x, DoubleDouble &u); +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS LIBC_INLINE Float128 accurate() const { + constexpr Float128 PI_OVER_128_F128 = { + Sign::POS, -133, 0xc90f'daa2'2168'c234'c4c6'628b'80dc'1cd1_u128}; + // y_lo = x * c_lo + pm.lo Float128 y_lo_0(x_reduced * ONE_TWENTY_EIGHT_OVER_PI[idx][3]); - Float128 y_lo_1 = fputil::quick_mul( - Float128(x_reduced), Float128(ONE_TWENTY_EIGHT_OVER_PI[idx][2])); - Float128 y_lo_2(pm_lo); - Float128 y_hi_f128(y_hi); - - Float128 y = fputil::quick_add( - y_hi_f128, - fputil::quick_add(y_lo_2, fputil::quick_add(y_lo_1, y_lo_0))); + Float128 y_lo_1 = fputil::quick_add(Float128(y_lo), y_lo_0); + Float128 y_mid_f128 = fputil::quick_add(Float128(y_mid.lo), y_lo_1); + Float128 y_hi_f128 = fputil::quick_add(Float128(y_hi), Float128(y_mid.hi)); + Float128 y = fputil::quick_add(y_hi_f128, y_mid_f128); return fputil::quick_mul(y, PI_OVER_128_F128); } +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS private: // Index of x in the look-up table ONE_TWENTY_EIGHT_OVER_PI. unsigned idx; // x scaled down by 2^(-16 *(idx - 3))). double x_reduced; - // High part of (x * 128/pi) mod 1. - double y_hi; - // Low part of x * ONE_TWENTY_EIGHT_OVER_PI[idx][1]. - double pm_lo; + // Parts of (x * 128/pi) mod 1. 
+ double y_hi, y_lo; + DoubleDouble y_mid; }; -LIBC_INLINE Float128 range_reduction_small_f128(double x) { - double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI[3][0]; +static Float128 range_reduction_small_f128(double x) { + constexpr Float128 PI_OVER_128_F128 = { + Sign::POS, -133, 0xc90f'daa2'2168'c234'c4c6'628b'80dc'1cd1_u128}; + constexpr double ONE_TWENTY_EIGHT_OVER_PI_D = 0x1.45f306dc9c883p5; + double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI_D; double kd = fputil::nearest_integer(prod_hi); Float128 mk_f128(-kd); @@ -178,7 +300,8 @@ LIBC_INLINE Float128 range_reduction_small_f128(double x) { return fputil::quick_mul(y, PI_OVER_128_F128); } -LIBC_INLINE constexpr Float128 SIN_K_PI_OVER_128_F128[65] = { +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +static constexpr Float128 SIN_K_PI_OVER_128_F128[65] = { {Sign::POS, 0, 0}, {Sign::POS, -133, 0xc90a'afbd'1b33'efc9'c539'edcb'fda0'cf2c_u128}, {Sign::POS, -132, 0xc8fb'2f88'6ec0'9f37'6a17'954b'2b7c'5171_u128}, @@ -245,8 +368,7 @@ LIBC_INLINE constexpr Float128 SIN_K_PI_OVER_128_F128[65] = { {Sign::POS, -128, 0xffec'4304'2668'65d9'5657'5523'6696'1732_u128}, {Sign::POS, 0, 1}, }; - -} // namespace generic +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/range_reduction_double_fma.h b/libc/src/math/generic/range_reduction_double_fma.h index 7448b5f63dfde2..cab031c28baa17 100644 --- a/libc/src/math/generic/range_reduction_double_fma.h +++ b/libc/src/math/generic/range_reduction_double_fma.h @@ -15,174 +15,62 @@ #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/math/generic/range_reduction_double_common.h" namespace LIBC_NAMESPACE_DECL { -namespace fma { - using LIBC_NAMESPACE::fputil::DoubleDouble; -LIBC_INLINE constexpr int FAST_PASS_EXPONENT = 32; +LIBC_INLINE unsigned LargeRangeReduction::fast(double x, DoubleDouble &u) { + 
using FPBits = typename fputil::FPBits; + FPBits xbits(x); -// Digits of 2^(16*i) / pi, generated by Sollya with: -// For [2..62]: -// > for i from 3 to 63 do { -// pi_inv = 2^(16*(i - 3)) / pi; -// pn = nearestint(pi_inv); -// pi_frac = pi_inv - pn; -// a = round(pi_frac, D, RN); -// b = round(pi_frac - a, D, RN); -// c = round(pi_frac - a - b, D, RN); -// d = round(pi_frac - a - b - c, D, RN); -// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},"); -// }; -// For [0..1]: -// The leading bit of 2^(16*(i - 3)) / pi is very small, so we add 0.25 so that -// the conditions for the algorithms are still satisfied, and one of those -// conditions guarantees that ulp(0.25 * x_reduced) >= 2, and will safely be -// discarded. -// for i from 0 to 2 do { -// pi_frac = 0.25 + 2^(16*(i - 3)) / pi; -// a = round(pi_frac, D, RN); -// b = round(pi_frac - a, D, RN); -// c = round(pi_frac - a - b, D, RN); -// d = round(pi_frac - a - b - c, D, RN); -// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},"); -// }; -// For The fast pass using double-double, we only need 3 parts (a, b, c), but -// for the accurate pass using Float128, instead of using another table of -// Float128s, we simply add the fourth path (a, b, c, d), which simplify the -// implementation a bit and saving some memory. 
-LIBC_INLINE constexpr double ONE_TWENTY_EIGHT_OVER_PI[64][4] = { - {0x1.0000000000014p5, 0x1.7cc1b727220a9p-49, 0x1.3f84eafa3ea6ap-103, - -0x1.11f924eb53362p-157}, - {0x1.0000000145f3p5, 0x1.b727220a94fe1p-49, 0x1.d5f47d4d37703p-104, - 0x1.b6295993c439p-158}, - {0x1.000145f306dcap5, -0x1.bbead603d8a83p-50, 0x1.f534ddc0db629p-106, - 0x1.664f10e4107f9p-160}, - {0x1.45f306dc9c883p5, -0x1.6b01ec5417056p-49, -0x1.6447e493ad4cep-103, - 0x1.e21c820ff28b2p-157}, - {-0x1.f246c6efab581p4, 0x1.3abe8fa9a6eep-53, 0x1.b6c52b3278872p-107, - 0x1.07f9458eaf7afp-164}, - {0x1.391054a7f09d6p4, -0x1.70565911f924fp-53, 0x1.2b3278872084p-107, - -0x1.ae9c5421443aap-162}, - {0x1.529fc2757d1f5p2, 0x1.a6ee06db14acdp-53, -0x1.8778df7c035d4p-107, - 0x1.d5ef5de2b0db9p-161}, - {-0x1.ec54170565912p-1, 0x1.b6c52b3278872p-59, 0x1.07f9458eaf7afp-116, - -0x1.d4f246dc8e2dfp-173}, - {-0x1.505c1596447e5p5, 0x1.b14acc9e21c82p-49, 0x1.fe5163abdebbcp-106, - 0x1.586dc91b8e909p-160}, - {-0x1.596447e493ad5p1, 0x1.93c439041fe51p-54, 0x1.8eaf7aef1586ep-108, - -0x1.b7238b7b645a4p-163}, - {0x1.bb81b6c52b328p5, -0x1.de37df00d74e3p-49, 0x1.7bd778ac36e49p-103, - -0x1.1c5bdb22d1ffap-158}, - {0x1.b6c52b3278872p5, 0x1.07f9458eaf7afp-52, -0x1.d4f246dc8e2dfp-109, - 0x1.374b801924bbbp-164}, - {0x1.2b3278872084p5, -0x1.ae9c5421443aap-50, 0x1.b7246e3a424ddp-106, - 0x1.700324977504fp-161}, - {-0x1.8778df7c035d4p5, 0x1.d5ef5de2b0db9p-49, 0x1.1b8e909374b8p-104, - 0x1.924bba8274648p-160}, - {-0x1.bef806ba71508p4, -0x1.443a9e48db91cp-50, -0x1.6f6c8b47fe6dbp-104, - -0x1.115f62e6de302p-158}, - {-0x1.ae9c5421443aap-2, 0x1.b7246e3a424ddp-58, 0x1.700324977504fp-113, - -0x1.cdbc603c429c7p-167}, - {-0x1.38a84288753c9p5, -0x1.b7238b7b645a4p-51, 0x1.924bba8274648p-112, - 0x1.cfe1deb1cb12ap-166}, - {-0x1.0a21d4f246dc9p3, 0x1.d2126e9700325p-53, -0x1.a22bec5cdbc6p-107, - -0x1.e214e34ed658cp-162}, - {-0x1.d4f246dc8e2dfp3, 0x1.374b801924bbbp-52, -0x1.f62e6de301e21p-106, - -0x1.38d3b5963045ep-160}, - {-0x1.236e4716f6c8bp4, 
-0x1.1ff9b6d115f63p-50, 0x1.921cfe1deb1cbp-106, - 0x1.29a73ee88235fp-162}, - {0x1.b8e909374b802p4, -0x1.b6d115f62e6dep-50, -0x1.80f10a71a76b3p-105, - 0x1.cfba208d7d4bbp-160}, - {0x1.09374b801924cp4, -0x1.15f62e6de301ep-50, -0x1.0a71a76b2c609p-105, - 0x1.1046bea5d7689p-159}, - {-0x1.68ffcdb688afbp3, -0x1.736f180f10a72p-53, 0x1.62534e7dd1047p-107, - -0x1.0568a25dbd8b3p-161}, - {0x1.924bba8274648p0, 0x1.cfe1deb1cb12ap-54, -0x1.63045df7282b4p-108, - -0x1.44bb7b16638fep-162}, - {-0x1.a22bec5cdbc6p5, -0x1.e214e34ed658cp-50, -0x1.177dca0ad144cp-106, - 0x1.213a671c09ad1p-160}, - {0x1.3a32439fc3bd6p1, 0x1.cb129a73ee882p-54, 0x1.afa975da24275p-109, - -0x1.8e3f652e8207p-164}, - {-0x1.b78c0788538d4p4, 0x1.29a73ee88235fp-50, 0x1.4baed1213a672p-104, - -0x1.fb29741037d8dp-159}, - {0x1.fc3bd63962535p5, -0x1.822efb9415a29p-51, 0x1.a24274ce38136p-105, - -0x1.741037d8cdc54p-159}, - {-0x1.4e34ed658c117p2, -0x1.f7282b4512edfp-52, 0x1.d338e04d68bfp-107, - -0x1.bec66e29c67cbp-162}, - {0x1.62534e7dd1047p5, -0x1.0568a25dbd8b3p-49, -0x1.c7eca5d040df6p-105, - -0x1.9b8a719f2b318p-160}, - {-0x1.63045df7282b4p4, -0x1.44bb7b16638fep-50, 0x1.ad17df904e647p-104, - 0x1.639835339f49dp-158}, - {0x1.d1046bea5d769p5, -0x1.bd8b31c7eca5dp-49, -0x1.037d8cdc538dp-107, - 0x1.a99cfa4e422fcp-161}, - {0x1.afa975da24275p3, -0x1.8e3f652e8207p-52, 0x1.3991d63983534p-106, - -0x1.82d8dee81d108p-160}, - {-0x1.a28976f62cc72p5, 0x1.35a2fbf209cc9p-53, -0x1.4e33e566305b2p-109, - 0x1.08bf177bf2507p-163}, - {-0x1.76f62cc71fb29p5, -0x1.d040df633714ep-49, -0x1.9f2b3182d8defp-104, - 0x1.f8bbdf9283b2p-158}, - {0x1.d338e04d68bfp5, -0x1.bec66e29c67cbp-50, 0x1.9cfa4e422fc5ep-105, - -0x1.036be27003b4p-161}, - {0x1.c09ad17df904ep4, 0x1.91d639835339fp-50, 0x1.272117e2ef7e5p-104, - -0x1.7c4e007680022p-158}, - {0x1.68befc827323bp5, -0x1.c67cacc60b638p-50, 0x1.17e2ef7e4a0ecp-104, - 0x1.ff897ffde0598p-158}, - {-0x1.037d8cdc538dp5, 0x1.a99cfa4e422fcp-49, 0x1.77bf250763ff1p-103, - 0x1.7ffde05980fefp-158}, - {-0x1.8cdc538cf9599p5, 
0x1.f49c845f8bbep-50, -0x1.b5f13801da001p-104, - 0x1.e05980fef2f12p-158}, - {-0x1.4e33e566305b2p3, 0x1.08bf177bf2507p-51, 0x1.8ffc4bffef02dp-105, - -0x1.fc04343b9d298p-160}, - {-0x1.f2b3182d8dee8p4, -0x1.d1081b5f13802p-52, 0x1.2fffbc0b301fep-107, - -0x1.a1dce94beb25cp-163}, - {-0x1.8c16c6f740e88p5, -0x1.036be27003b4p-49, -0x1.0fd33f8086877p-109, - -0x1.d297d64b824b2p-164}, - {0x1.3908bf177bf25p5, 0x1.d8ffc4bffef03p-53, -0x1.9fc04343b9d29p-108, - -0x1.f592e092c9813p-162}, - {0x1.7e2ef7e4a0ec8p4, -0x1.da00087e99fcp-56, -0x1.0d0ee74a5f593p-110, - 0x1.f6d367ecf27cbp-166}, - {-0x1.081b5f13801dap4, -0x1.0fd33f8086877p-61, -0x1.d297d64b824b2p-116, - -0x1.8130d834f648bp-170}, - {-0x1.af89c00ed0004p5, -0x1.fa67f010d0ee7p-50, -0x1.297d64b824b26p-104, - -0x1.30d834f648b0cp-162}, - {-0x1.c00ed00043f4dp5, 0x1.fde5e2316b415p-55, -0x1.2e092c98130d8p-110, - -0x1.a7b24585ce04dp-165}, - {0x1.2fffbc0b301fep5, -0x1.a1dce94beb25cp-51, -0x1.25930261b069fp-107, - 0x1.b74f463f669e6p-162}, - {-0x1.0fd33f8086877p3, -0x1.d297d64b824b2p-52, -0x1.8130d834f648bp-106, - -0x1.738132c3402bap-163}, - {-0x1.9fc04343b9d29p4, -0x1.f592e092c9813p-50, -0x1.b069ec9161738p-107, - -0x1.32c3402ba515bp-163}, - {-0x1.0d0ee74a5f593p2, 0x1.f6d367ecf27cbp-54, 0x1.36e9e8c7ecd3dp-111, - -0x1.00ae9456c229cp-165}, - {-0x1.dce94beb25c12p5, -0x1.64c0986c1a7b2p-49, -0x1.161738132c34p-103, - -0x1.5d28ad8453814p-158}, - {-0x1.4beb25c12593p5, -0x1.30d834f648b0cp-50, 0x1.8fd9a797fa8b6p-104, - -0x1.5b08a7028341dp-159}, - {0x1.b47db4d9fb3cap4, -0x1.a7b24585ce04dp-53, 0x1.3cbfd45aea4f7p-107, - 0x1.63f5f2f8bd9e8p-161}, - {-0x1.25930261b069fp5, 0x1.b74f463f669e6p-50, -0x1.5d28ad8453814p-110, - -0x1.a0e84c2f8c608p-166}, - {0x1.fb3c9f2c26dd4p4, -0x1.738132c3402bap-51, -0x1.456c229c0a0dp-105, - -0x1.d0985f18c10ebp-159}, - {-0x1.b069ec9161738p5, -0x1.32c3402ba515bp-51, -0x1.14e050683a131p-108, - 0x1.0739f78a5292fp-162}, - {-0x1.ec9161738132cp5, -0x1.a015d28ad8454p-50, 0x1.faf97c5ecf41dp-104, - -0x1.821d6b5b4565p-160}, - 
{-0x1.61738132c3403p5, 0x1.16ba93dd63f5fp-49, 0x1.7c5ecf41ce7dep-104, - 0x1.4a525d4d7f6bfp-159}, - {0x1.fb34f2ff516bbp3, -0x1.b08a7028341d1p-51, 0x1.9e839cfbc5295p-105, - -0x1.a2b2809409dc1p-159}, - {0x1.3cbfd45aea4f7p5, 0x1.63f5f2f8bd9e8p-49, 0x1.ce7de294a4baap-104, - -0x1.404a04ee072a3p-158}, - {-0x1.5d28ad8453814p2, -0x1.a0e84c2f8c608p-54, -0x1.d6b5b45650128p-108, - -0x1.3b81ca8bdea7fp-164}, - {-0x1.15b08a7028342p5, 0x1.7b3d0739f78a5p-50, 0x1.497535fdafd89p-105, - -0x1.ca8bdea7f33eep-164}, -}; + int x_e_m62 = xbits.get_biased_exponent() - (FPBits::EXP_BIAS + 62); + idx = static_cast((x_e_m62 >> 4) + 3); + // Scale x down by 2^(-(16 * (idx - 3)) + xbits.set_biased_exponent((x_e_m62 & 15) + FPBits::EXP_BIAS + 62); + // 2^62 <= |x_reduced| < 2^(62 + 16) = 2^78 + x_reduced = xbits.get_val(); + // x * c_hi = ph.hi + ph.lo exactly. + DoubleDouble ph = + fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][0]); + // x * c_mid = pm.hi + pm.lo exactly. + DoubleDouble pm = + fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][1]); + // x * c_lo = pl.hi + pl.lo exactly. + DoubleDouble pl = + fputil::exact_mult(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][2]); + // Extract integral parts and fractional parts of (ph.lo + pm.hi). 
+ double sum_hi = ph.lo + pm.hi; + double kd = fputil::nearest_integer(sum_hi); + + // x * 128/pi mod 1 ~ y_hi + y_mid + y_lo + y_hi = (ph.lo - kd) + pm.hi; // Exact + y_mid = fputil::exact_add(pm.lo, pl.hi); + y_lo = pl.lo; + + // y_l = x * c_lo_2 + pl.lo + double y_l = + fputil::multiply_add(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][3], y_lo); + DoubleDouble y = fputil::exact_add(y_hi, y_mid.hi); + y.lo += (y_mid.lo + y_l); + + // Digits of pi/128, generated by Sollya with: + // > a = round(pi/128, D, RN); + // > b = round(pi/128 - a, D, RN); + constexpr DoubleDouble PI_OVER_128_DD = {0x1.1a62633145c07p-60, + 0x1.921fb54442d18p-6}; + + // Error bound: with {a} denote the fractional part of a, i.e.: + // {a} = a - round(a) + // Then, + // | {x * 128/pi} - (y_hi + y_lo) | <= ulp(ulp(y_hi)) <= 2^-105 + // | {x mod pi/128} - (u.hi + u.lo) | < 2 * 2^-6 * 2^-105 = 2^-110 + u = fputil::quick_mult(y, PI_OVER_128_DD); + + return static_cast(static_cast(kd)); +} // Lookup table for sin(k * pi / 128) with k = 0, ..., 255. 
// Table is generated with Sollya as follow: @@ -258,6 +146,7 @@ LIBC_INLINE constexpr DoubleDouble SIN_K_PI_OVER_128[256] = { {-0x1.c57bc2e24aa15p-57, 0x1.ff621e3796d7ep-1}, {-0x1.1354d4556e4cbp-55, 0x1.ffd886084cd0dp-1}, {0, 1}, +#ifndef LIBC_MATH_HAS_SMALL_TABLES {-0x1.1354d4556e4cbp-55, 0x1.ffd886084cd0dp-1}, {-0x1.c57bc2e24aa15p-57, 0x1.ff621e3796d7ep-1}, {0x1.521ecd0c67e35p-57, 0x1.fe9cdad01883ap-1}, @@ -449,48 +338,9 @@ LIBC_INLINE constexpr DoubleDouble SIN_K_PI_OVER_128[256] = { {0x1.9a088a8bf6b2cp-59, -0x1.2d52092ce19f6p-4}, {0x1.912bd0d569a9p-61, -0x1.91f65f10dd814p-5}, {0x1.b1d63091a013p-64, -0x1.92155f7a3667ep-6}, +#endif // !LIBC_MATH_HAS_SMALL_TABLES }; -// For |x| < 2^-32, return k and u such that: -// k = round(x * 128/pi) -// x mod pi/128 = x - k * pi/128 ~ u.hi + u.lo -LIBC_INLINE unsigned range_reduction_small(double x, DoubleDouble &u) { - // Digits of pi/128, generated by Sollya with: - // > a = round(pi/128, D, RN); - // > b = round(pi/128 - a, D, RN); - constexpr DoubleDouble PI_OVER_128_DD = {0x1.1a62633145c07p-60, - 0x1.921fb54442d18p-6}; - - double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI[3][0]; - double kd = fputil::nearest_integer(prod_hi); - - // Let y = x - k * (pi/128) - // Then |y| < pi / 256 - // With extra rounding errors, we can bound |y| < 2^-6. - double y_hi = fputil::multiply_add(kd, -PI_OVER_128_DD.hi, x); // Exact - // u_hi + u_lo ~ (y_hi + kd*(-PI_OVER_128_DD[1])) - // and |u_lo| < 2* ulp(u_hi) - // The upper bound 2^-6 is over-estimated, we should still have: - // |u_hi + u_lo| < 2^-6. 
- u.hi = fputil::multiply_add(kd, -PI_OVER_128_DD.lo, y_hi); - u.lo = y_hi - u.hi; // Exact; - u.lo = fputil::multiply_add(kd, -PI_OVER_128_DD.lo, u.lo); - // Error bound: - // For |x| < 2^32: - // |x * high part of 128/pi| < 2^32 * 2^6 = 2^38 - // So |k| = |round(x * high part of 128/pi)| < 2^38 - // And hence, - // |(x mod pi/128) - (u.hi + u.lo)| <= ulp(2 * kd * PI_OVER_128_DD.lo) - // < 2 * 2^38 * 2^-59 * 2^-52 - // = 2^-72 - // Note: if we limit the input exponent to the same as in non-FMA version, - // i.e., |x| < 2^-23, then the output errors can be bounded by 2^-81, similar - // to the large range reduction bound. - return static_cast(static_cast(kd)); -} - -} // namespace fma - } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_FMA_H diff --git a/libc/src/math/generic/range_reduction_double_nofma.h b/libc/src/math/generic/range_reduction_double_nofma.h index 445a45d3f9796a..56407329477989 100644 --- a/libc/src/math/generic/range_reduction_double_nofma.h +++ b/libc/src/math/generic/range_reduction_double_nofma.h @@ -15,174 +15,63 @@ #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/math/generic/range_reduction_double_common.h" namespace LIBC_NAMESPACE_DECL { -namespace nofma { - using fputil::DoubleDouble; -LIBC_INLINE constexpr int FAST_PASS_EXPONENT = 23; +LIBC_INLINE unsigned LargeRangeReduction::fast(double x, DoubleDouble &u) { + using FPBits = typename fputil::FPBits; + FPBits xbits(x); -// Digits of 2^(16*i) / pi, generated by Sollya with: -// For [2..62]: -// > for i from 3 to 63 do { -// pi_inv = 2^(16*(i - 3)) / pi; -// pn = nearestint(pi_inv); -// pi_frac = pi_inv - pn; -// a = round(pi_frac, 51, RN); -// b = round(pi_frac - a, 51, RN); -// c = round(pi_frac - a - b, D, RN); -// d = round(pi_frac - a - b - c, D, RN); -// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, 
",", 2^7 * d, "},"); -// }; -// For [0..1]: -// The leading bit of 2^(16*(i - 3)) / pi is very small, so we add 0.25 so that -// the conditions for the algorithms are still satisfied, and one of those -// conditions guarantees that ulp(0.25 * x_reduced) >= 2, and will safely be -// discarded. -// for i from 0 to 2 do { -// pi_frac = 0.25 + 2^(16*(i - 3)) / pi; -// a = round(pi_frac, 51, RN); -// b = round(pi_frac - a, 51, RN); -// c = round(pi_frac - a - b, D, RN); -// d = round(pi_frac - a - b - c, D, RN); -// print("{", 2^7 * a, ",", 2^7 * b, ",", 2^7 * c, ",", 2^7 * d, "},"); -// }; -// For The fast pass using double-double, we only need 3 parts (a, b, c), but -// for the accurate pass using Float128, instead of using another table of -// Float128s, we simply add the fourth path (a, b, c, d), which simplify the -// implementation a bit and saving some memory. -LIBC_INLINE constexpr double ONE_TWENTY_EIGHT_OVER_PI[64][4] = { - {0x1.0000000000014p5, 0x1.7cc1b727220a8p-49, 0x1.4fe13abe8fa9ap-101, - 0x1.bb81b6c52b328p-155}, - {0x1.0000000145f3p5, 0x1.b727220a94fep-49, 0x1.3abe8fa9a6eep-101, - 0x1.b6c52b3278872p-155}, - {0x1.000145f306dc8p5, 0x1.c882a53f84ebp-47, -0x1.70565911f924fp-101, - 0x1.2b3278872084p-155}, - {0x1.45f306dc9c884p5, -0x1.5ac07b1505c14p-47, -0x1.96447e493ad4dp-99, - 0x1.3c439041fe516p-154}, - {-0x1.f246c6efab58p4, -0x1.ec5417056591p-49, -0x1.f924eb53361dep-101, - -0x1.bef806ba71508p-156}, - {0x1.391054a7f09d4p4, 0x1.f47d4d377036cp-48, 0x1.8a5664f10e41p-100, - 0x1.fe5163abdebbcp-154}, - {0x1.529fc2757d1f4p2, 0x1.34ddc0db62958p-50, 0x1.93c439041fe51p-102, - 0x1.8eaf7aef1586ep-156}, - {-0x1.ec5417056591p-1, -0x1.f924eb53361ep-53, 0x1.c820ff28b1d5fp-105, - -0x1.443a9e48db91cp-162}, - {-0x1.505c1596447e4p5, -0x1.275a99b0ef1cp-48, 0x1.07f9458eaf7afp-100, - -0x1.d4f246dc8e2dfp-157}, - {-0x1.596447e493ad4p1, -0x1.9b0ef1bef806cp-52, 0x1.63abdebbc561bp-106, - 0x1.c91b8e909374cp-160}, - {0x1.bb81b6c52b328p5, -0x1.de37df00d74e4p-49, 0x1.5ef5de2b0db92p-101, - 
0x1.b8e909374b802p-156}, - {0x1.b6c52b3278874p5, -0x1.f7c035d38a844p-47, 0x1.778ac36e48dc7p-99, - 0x1.2126e97003249p-153}, - {0x1.2b3278872084p5, -0x1.ae9c5421443a8p-50, -0x1.e48db91c5bdb2p-102, - -0x1.68ffcdb688afbp-157}, - {-0x1.8778df7c035d4p5, 0x1.d5ef5de2b0db8p-49, 0x1.2371d2126e97p-101, - 0x1.924bba8274648p-160}, - {-0x1.bef806ba71508p4, -0x1.443a9e48db91cp-50, -0x1.6f6c8b47fe6dbp-104, - -0x1.115f62e6de302p-158}, - {-0x1.ae9c5421443a8p-2, -0x1.e48db91c5bdb4p-54, 0x1.d2e006492eea1p-106, - -0x1.8b9b78c078854p-160}, - {-0x1.38a84288753c8p5, -0x1.1b7238b7b645cp-47, 0x1.c00c925dd413ap-99, - 0x1.921cfe1deb1cbp-154}, - {-0x1.0a21d4f246dc8p3, -0x1.c5bdb22d1ff9cp-50, 0x1.25dd413a3243ap-103, - -0x1.e214e34ed658cp-162}, - {-0x1.d4f246dc8e2ep3, 0x1.26e9700324978p-49, -0x1.5f62e6de301e2p-102, - -0x1.4e34ed658c117p-158}, - {-0x1.236e4716f6c8cp4, 0x1.700324977505p-49, -0x1.736f180f10a72p-101, - 0x1.62534e7dd1047p-155}, - {0x1.b8e909374b8p4, 0x1.924bba8274648p-48, 0x1.cfe1deb1cb12ap-102, - -0x1.63045df7282b4p-156}, - {0x1.09374b801924cp4, -0x1.15f62e6de302p-50, 0x1.deb1cb129a73fp-102, - -0x1.77dca0ad144bbp-158}, - {-0x1.68ffcdb688afcp3, 0x1.d1921cfe1debp-50, 0x1.cb129a73ee882p-102, - 0x1.afa975da24275p-157}, - {0x1.924bba8274648p0, 0x1.cfe1deb1cb128p-54, 0x1.a73ee88235f53p-106, - -0x1.44bb7b16638fep-162}, - {-0x1.a22bec5cdbc6p5, -0x1.e214e34ed658cp-50, -0x1.177dca0ad144cp-106, - 0x1.213a671c09ad1p-160}, - {0x1.3a32439fc3bd8p1, -0x1.c69dacb1822fp-51, 0x1.1afa975da2427p-105, - 0x1.338e04d68befdp-159}, - {-0x1.b78c0788538d4p4, 0x1.29a73ee88236p-50, -0x1.5a28976f62cc7p-103, - -0x1.fb29741037d8dp-159}, - {0x1.fc3bd63962534p5, 0x1.cfba208d7d4bcp-48, -0x1.12edec598e3f6p-100, - -0x1.4ba081bec66e3p-154}, - {-0x1.4e34ed658c118p2, 0x1.046bea5d7689p-51, 0x1.3a671c09ad17ep-104, - -0x1.bec66e29c67cbp-162}, - {0x1.62534e7dd1048p5, -0x1.415a28976f62cp-47, -0x1.8e3f652e8207p-100, - 0x1.3991d63983534p-154}, - {-0x1.63045df7282b4p4, -0x1.44bb7b16638fcp-50, -0x1.94ba081bec66ep-102, - 
-0x1.4e33e566305b2p-157}, - {0x1.d1046bea5d768p5, 0x1.213a671c09adp-48, 0x1.7df904e64758ep-100, - 0x1.835339f49c846p-154}, - {0x1.afa975da24274p3, 0x1.9c7026b45f7e4p-50, 0x1.3991d63983534p-106, - -0x1.82d8dee81d108p-160}, - {-0x1.a28976f62cc7p5, -0x1.fb29741037d8cp-47, -0x1.b8a719f2b3183p-100, - 0x1.3908bf177bf25p-155}, - {-0x1.76f62cc71fb28p5, -0x1.741037d8cdc54p-47, 0x1.cc1a99cfa4e42p-101, - 0x1.7e2ef7e4a0ec8p-156}, - {0x1.d338e04d68bfp5, -0x1.bec66e29c67ccp-50, 0x1.339f49c845f8cp-102, - -0x1.081b5f13801dap-156}, - {0x1.c09ad17df905p4, -0x1.9b8a719f2b318p-48, -0x1.6c6f740e8840ep-103, - 0x1.41d8ffc4bffefp-157}, - {0x1.68befc827323cp5, -0x1.38cf9598c16c8p-47, 0x1.08bf177bf2507p-99, - 0x1.8ffc4bffef02dp-153}, - {-0x1.037d8cdc538dp5, 0x1.a99cfa4e422fcp-49, 0x1.77bf250763ff1p-103, - 0x1.7ffde05980fefp-158}, - {-0x1.8cdc538cf9598p5, -0x1.82d8dee81d108p-48, -0x1.b5f13801da001p-104, - 0x1.e05980fef2f12p-158}, - {-0x1.4e33e566305bp3, -0x1.bdd03a21036cp-49, 0x1.d8ffc4bffef03p-101, - -0x1.9fc04343b9d29p-156}, - {-0x1.f2b3182d8dee8p4, -0x1.d1081b5f138p-52, -0x1.da00087e99fcp-104, - -0x1.0d0ee74a5f593p-158}, - {-0x1.8c16c6f740e88p5, -0x1.036be27003b4p-49, -0x1.0fd33f8086877p-109, - -0x1.d297d64b824b2p-164}, - {0x1.3908bf177bf24p5, 0x1.0763ff12fffbcp-47, 0x1.6603fbcbc462dp-104, - 0x1.a0a6d1f6d367fp-158}, - {0x1.7e2ef7e4a0ec8p4, -0x1.da00087e99fcp-56, -0x1.0d0ee74a5f593p-110, - 0x1.f6d367ecf27cbp-166}, - {-0x1.081b5f13801dcp4, 0x1.fff7816603fbcp-48, 0x1.788c5ad05369p-101, - -0x1.25930261b069fp-155}, - {-0x1.af89c00ed0004p5, -0x1.fa67f010d0ee8p-50, 0x1.6b414da3eda6dp-103, - -0x1.30d834f648b0cp-162}, - {-0x1.c00ed00043f4cp5, -0x1.fc04343b9d298p-48, 0x1.4da3eda6cfd9ep-103, - 0x1.3e584dba7a32p-157}, - {0x1.2fffbc0b301fcp5, 0x1.e5e2316b414dcp-47, -0x1.c125930261b07p-99, - 0x1.84dba7a31fb35p-153}, - {-0x1.0fd33f8086878p3, 0x1.8b5a0a6d1f6d4p-50, -0x1.30261b069ec91p-103, - -0x1.85ce04cb0d00bp-157}, - {-0x1.9fc04343b9d28p4, -0x1.7d64b824b2604p-48, -0x1.86c1a7b24585dp-101, - 
0x1.fb34f2ff516bbp-157}, - {-0x1.0d0ee74a5f594p2, 0x1.1f6d367ecf27cp-50, 0x1.6136e9e8c7ecdp-103, - 0x1.e5fea2d7527bbp-158}, - {-0x1.dce94beb25c14p5, 0x1.a6cfd9e4f9614p-47, -0x1.22c2e70265868p-100, - -0x1.5d28ad8453814p-158}, - {-0x1.4beb25c12593p5, -0x1.30d834f648b0cp-50, 0x1.8fd9a797fa8b6p-104, - -0x1.5b08a7028341dp-159}, - {0x1.b47db4d9fb3c8p4, 0x1.f2c26dd3d18fcp-48, 0x1.9a797fa8b5d4ap-100, - -0x1.14e050683a131p-156}, - {-0x1.25930261b06ap5, 0x1.36e9e8c7ecd3cp-47, 0x1.7fa8b5d49eeb2p-100, - -0x1.41a0e84c2f8c6p-158}, - {0x1.fb3c9f2c26dd4p4, -0x1.738132c3402bcp-51, 0x1.aea4f758fd7ccp-103, - -0x1.d0985f18c10ebp-159}, - {-0x1.b069ec9161738p5, -0x1.32c3402ba515cp-51, 0x1.eeb1faf97c5edp-104, - -0x1.7c63043ad6b69p-161}, - {-0x1.ec9161738132cp5, -0x1.a015d28ad8454p-50, 0x1.faf97c5ecf41dp-104, - -0x1.821d6b5b4565p-160}, - {-0x1.61738132c3404p5, 0x1.45aea4f758fd8p-47, -0x1.a0e84c2f8c608p-102, - -0x1.d6b5b45650128p-156}, - {0x1.fb34f2ff516bcp3, -0x1.6c229c0a0d074p-49, -0x1.30be31821d6b6p-104, - 0x1.2ea6bfb5fb12p-158}, - {0x1.3cbfd45aea4f8p5, -0x1.4e050683a130cp-48, 0x1.ce7de294a4baap-104, - -0x1.404a04ee072a3p-158}, - {-0x1.5d28ad8453814p2, -0x1.a0e84c2f8c608p-54, -0x1.d6b5b45650128p-108, - -0x1.3b81ca8bdea7fp-164}, - {-0x1.15b08a702834p5, -0x1.d0985f18c10ecp-47, 0x1.4a4ba9afed7ecp-100, - 0x1.1f8d5d0856033p-154}, -}; + int x_e_m62 = xbits.get_biased_exponent() - (FPBits::EXP_BIAS + 62); + idx = static_cast((x_e_m62 >> 4) + 3); + // Scale x down by 2^(-(16 * (idx - 3)) + xbits.set_biased_exponent((x_e_m62 & 15) + FPBits::EXP_BIAS + 62); + // 2^62 <= |x_reduced| < 2^(62 + 16) = 2^78 + x_reduced = xbits.get_val(); + // x * c_hi = ph.hi + ph.lo exactly. + DoubleDouble x_split = fputil::split(x_reduced); + DoubleDouble ph = fputil::exact_mult(x_split, x_reduced, + ONE_TWENTY_EIGHT_OVER_PI[idx][0]); + // x * c_mid = pm.hi + pm.lo exactly. + DoubleDouble pm = fputil::exact_mult(x_split, x_reduced, + ONE_TWENTY_EIGHT_OVER_PI[idx][1]); + // x * c_lo = pl.hi + pl.lo exactly. 
+ DoubleDouble pl = fputil::exact_mult(x_split, x_reduced, + ONE_TWENTY_EIGHT_OVER_PI[idx][2]); + // Extract integral parts and fractional parts of (ph.lo + pm.hi). + double sum_hi = ph.lo + pm.hi; + double kd = fputil::nearest_integer(sum_hi); + + // x * 128/pi mod 1 ~ y_hi + y_mid + y_lo + y_hi = (ph.lo - kd) + pm.hi; // Exact + y_mid = fputil::exact_add(pm.lo, pl.hi); + y_lo = pl.lo; + + // y_l = x * c_lo_2 + pl.lo + double y_l = + fputil::multiply_add(x_reduced, ONE_TWENTY_EIGHT_OVER_PI[idx][3], y_lo); + DoubleDouble y = fputil::exact_add(y_hi, y_mid.hi); + y.lo += (y_mid.lo + y_l); + + // Digits of pi/128, generated by Sollya with: + // > a = round(pi/128, D, RN); + // > b = round(pi/128 - a, D, RN); + constexpr DoubleDouble PI_OVER_128_DD = {0x1.1a62633145c07p-60, + 0x1.921fb54442d18p-6}; + + // Error bound: with {a} denote the fractional part of a, i.e.: + // {a} = a - round(a) + // Then, + // | {x * 128/pi} - (y_hi + y_lo) | <= ulp(ulp(y_hi)) <= 2^-105 + // | {x mod pi/128} - (u.hi + u.lo) | < 2 * 2^-6 * 2^-105 = 2^-110 + u = fputil::quick_mult(y, PI_OVER_128_DD); + + return static_cast(static_cast(kd)); +} // Lookup table for sin(k * pi / 128) with k = 0, ..., 255. 
// Table is generated with Sollya as follow: @@ -258,6 +147,7 @@ LIBC_INLINE constexpr DoubleDouble SIN_K_PI_OVER_128[256] = { {0x1.e3a843d1db55fp-53, 0x1.ff621e3796d7cp-1}, {0x1.765595d548d9ap-54, 0x1.ffd886084cd0cp-1}, {0, 1}, +#ifndef LIBC_MATH_HAS_SMALL_TABLES {0x1.765595d548d9ap-54, 0x1.ffd886084cd0cp-1}, {0x1.e3a843d1db55fp-53, 0x1.ff621e3796d7cp-1}, {-0x1.eade132f3981dp-53, 0x1.fe9cdad01883cp-1}, @@ -449,46 +339,9 @@ LIBC_INLINE constexpr DoubleDouble SIN_K_PI_OVER_128[256] = { {-0x1.ccbeeeae8129ap-56, -0x1.2d52092ce19f4p-4}, {0x1.912bd0d569a9p-61, -0x1.91f65f10dd814p-5}, {-0x1.f938a73db97fbp-58, -0x1.92155f7a3667cp-6}, +#endif // !LIBC_MATH_HAS_SMALL_TABLES }; -LIBC_INLINE unsigned range_reduction_small(double x, DoubleDouble &u) { - constexpr double ONE_TWENTY_EIGHT_OVER_PI = 0x1.45f306dc9c883p5; - - // Digits of -pi/128, generated by Sollya with: - // > a = round(-pi/128, 25, RN); - // > b = round(-pi/128 - a, 23, RN); - // > c = round(-pi/128 - a - b, 25, RN); - // > d = round(-pi/128 - a - b - c, D, RN); - // -pi/128 ~ a + b + c + d - // The precisions of the parts are chosen so that: - // 1) k * a, k * b, k * c are exact in double precision - // 2) k * b + (x - (k * a)) is exact in double precsion - constexpr double MPI_OVER_128[4] = {-0x1.921fb5p-6, -0x1.110b48p-32, - +0x1.ee59dap-56, -0x1.98a2e03707345p-83}; - - double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI; - double kd = fputil::nearest_integer(prod_hi); - - // With -pi/128 ~ a + b + c + d as in MPI_OVER_128 description: - // t = x + k * a - double t = fputil::multiply_add(kd, MPI_OVER_128[0], x); // Exact - // y_hi = t + k * b = (x + k * a) + k * b - double y_hi = fputil::multiply_add(kd, MPI_OVER_128[1], t); // Exact - // y_lo ~ k * c + k * d - double y_lo = fputil::multiply_add(kd, MPI_OVER_128[2], kd * MPI_OVER_128[3]); - // u.hi + u.lo ~ x + k * (a + b + c + d) - u = fputil::exact_add(y_hi, y_lo); - // Error bound: For |x| < 2^-23, - // |(x mod pi/128) - (u_hi + u_lo)| < ulp(y_lo) - // <= ulp(2 
* x * c) - // <= ulp(2^24 * 2^-56) - // = 2^(24 - 56 - 52) - // = 2^-84 - return static_cast(static_cast(kd)); -} - -} // namespace nofma - } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_MATH_GENERIC_RANGE_REDUCTION_DOUBLE_NOFMA_H diff --git a/libc/src/math/generic/sin.cpp b/libc/src/math/generic/sin.cpp index da3d1e94b5f645..2e1d3ffd5f37d8 100644 --- a/libc/src/math/generic/sin.cpp +++ b/libc/src/math/generic/sin.cpp @@ -18,17 +18,14 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/math/generic/range_reduction_double_common.h" #include "src/math/generic/sincos_eval.h" -// TODO: We might be able to improve the performance of large range reduction of -// non-FMA targets further by operating directly on 25-bit chunks of 128/pi and -// pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of -// those lookup table. -#include "range_reduction_double_common.h" - -#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) -#define LIBC_MATH_SIN_SKIP_ACCURATE_PASS -#endif +#ifdef LIBC_TARGET_CPU_HAS_FMA +#include "range_reduction_double_fma.h" +#else +#include "range_reduction_double_nofma.h" +#endif // LIBC_TARGET_CPU_HAS_FMA namespace LIBC_NAMESPACE_DECL { @@ -43,33 +40,39 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { DoubleDouble y; unsigned k; - generic::LargeRangeReduction range_reduction_large{}; + LargeRangeReduction range_reduction_large{}; - // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA) + // |x| < 2^16 if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { - // |x| < 2^-26 - if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 26)) { - // Signed zeros. - if (LIBC_UNLIKELY(x == 0.0)) - return x; + // |x| < 2^-7 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) { + // |x| < 2^-26, |sin(x) - x| < ulp(x)/2. + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 26)) { + // Signed zeros. 
+ if (LIBC_UNLIKELY(x == 0.0)) + return x; - // For |x| < 2^-26, |sin(x) - x| < ulp(x)/2. #ifdef LIBC_TARGET_CPU_HAS_FMA - return fputil::multiply_add(x, -0x1.0p-54, x); + return fputil::multiply_add(x, -0x1.0p-54, x); #else - if (LIBC_UNLIKELY(x_e < 4)) { - int rounding_mode = fputil::quick_get_round(); - if (rounding_mode == FE_TOWARDZERO || - (xbits.sign() == Sign::POS && rounding_mode == FE_DOWNWARD) || - (xbits.sign() == Sign::NEG && rounding_mode == FE_UPWARD)) - return FPBits(xbits.uintval() - 1).get_val(); - } - return fputil::multiply_add(x, -0x1.0p-54, x); + if (LIBC_UNLIKELY(x_e < 4)) { + int rounding_mode = fputil::quick_get_round(); + if (rounding_mode == FE_TOWARDZERO || + (xbits.sign() == Sign::POS && rounding_mode == FE_DOWNWARD) || + (xbits.sign() == Sign::NEG && rounding_mode == FE_UPWARD)) + return FPBits(xbits.uintval() - 1).get_val(); + } + return fputil::multiply_add(x, -0x1.0p-54, x); #endif // LIBC_TARGET_CPU_HAS_FMA + } + // No range reduction needed. + k = 0; + y.lo = 0.0; + y.hi = x; + } else { + // Small range reduction. + k = range_reduction_small(x, y); } - - // // Small range reduction. - k = range_reduction_small(x, y); } else { // Inf or NaN if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { @@ -82,69 +85,51 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { } // Large range reduction. 
- k = range_reduction_large.compute_high_part(x); - y = range_reduction_large.fast(); + k = range_reduction_large.fast(x, y); } DoubleDouble sin_y, cos_y; - generic::sincos_eval(y, sin_y, cos_y); + [[maybe_unused]] double err = generic::sincos_eval(y, sin_y, cos_y); // Look up sin(k * pi/128) and cos(k * pi/128) - // Memory saving versions: - - // Use 128-entry table instead: - // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127]; - // uint64_t sin_s = static_cast(k & 128) << (63 - 7); - // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127]; - // uint64_t cos_s = static_cast((k + 64) & 128) << (63 - 7); - // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - - // Use 64-entry table instead: - // auto get_idx_dd = [](unsigned kk) -> DoubleDouble { - // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - // DoubleDouble ans = SIN_K_PI_OVER_128[idx]; - // if (kk & 128) { - // ans.hi = -ans.hi; - // ans.lo = -ans.lo; - // } - // return ans; - // }; - // DoubleDouble sin_k = get_idx_dd(k); - // DoubleDouble cos_k = get_idx_dd(k + 64); - +#ifdef LIBC_MATH_HAS_SMALL_TABLES + // Memory saving versions. Use 65-entry table. + auto get_idx_dd = [](unsigned kk) -> DoubleDouble { + unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + DoubleDouble ans = SIN_K_PI_OVER_128[idx]; + if (kk & 128) { + ans.hi = -ans.hi; + ans.lo = -ans.lo; + } + return ans; + }; + DoubleDouble sin_k = get_idx_dd(k); + DoubleDouble cos_k = get_idx_dd(k + 64); +#else // Fast look up version, but needs 256-entry table. // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). 
DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 255]; DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 255]; +#endif // After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128). // So k is an integer and -pi / 256 <= y <= pi / 256. // Then sin(x) = sin((k * pi/128 + y) // = sin(y) * cos(k*pi/128) + cos(y) * sin(k*pi/128) - DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k); - DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k); + DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k); + DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k); DoubleDouble rr = fputil::exact_add(sin_k_cos_y.hi, cos_k_sin_y.hi); rr.lo += sin_k_cos_y.lo + cos_k_sin_y.lo; -#ifdef LIBC_MATH_SIN_SKIP_ACCURATE_PASS +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS return rr.hi + rr.lo; #else // Accurate test and pass for correctly rounded implementation. -#ifdef LIBC_TARGET_CPU_HAS_FMA - constexpr double ERR = 0x1.0p-70; -#else - // TODO: Improve non-FMA fast pass accuracy. - constexpr double ERR = 0x1.0p-66; -#endif // LIBC_TARGET_CPU_HAS_FMA - - double rlp = rr.lo + ERR; - double rlm = rr.lo - ERR; + double rlp = rr.lo + err; + double rlm = rr.lo - err; double r_upper = rr.hi + rlp; // (rr.lo + ERR); double r_lower = rr.hi + rlm; // (rr.lo - ERR); @@ -155,7 +140,7 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { Float128 u_f128, sin_u, cos_u; if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) - u_f128 = generic::range_reduction_small_f128(x); + u_f128 = range_reduction_small_f128(x); else u_f128 = range_reduction_large.accurate(); @@ -163,7 +148,7 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { auto get_sin_k = [](unsigned kk) -> Float128 { unsigned idx = (kk & 64) ? 
64 - (kk & 63) : (kk & 63); - Float128 ans = generic::SIN_K_PI_OVER_128_F128[idx]; + Float128 ans = SIN_K_PI_OVER_128_F128[idx]; if (kk & 128) ans.sign = Sign::NEG; return ans; @@ -182,7 +167,7 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { // https://github.com/llvm/llvm-project/issues/96452. return static_cast(r); -#endif // !LIBC_MATH_SIN_SKIP_ACCURATE_PASS +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/sincos.cpp b/libc/src/math/generic/sincos.cpp index 1af0ee7b0eb2c8..166ce466031409 100644 --- a/libc/src/math/generic/sincos.cpp +++ b/libc/src/math/generic/sincos.cpp @@ -19,17 +19,14 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/math/generic/range_reduction_double_common.h" #include "src/math/generic/sincos_eval.h" -// TODO: We might be able to improve the performance of large range reduction of -// non-FMA targets further by operating directly on 25-bit chunks of 128/pi and -// pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of -// those lookup table. 
-#include "range_reduction_double_common.h" - -#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) -#define LIBC_MATH_SINCOS_SKIP_ACCURATE_PASS -#endif +#ifdef LIBC_TARGET_CPU_HAS_FMA +#include "range_reduction_double_fma.h" +#else +#include "range_reduction_double_nofma.h" +#endif // LIBC_TARGET_CPU_HAS_FMA namespace LIBC_NAMESPACE_DECL { @@ -44,40 +41,47 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { DoubleDouble y; unsigned k; - generic::LargeRangeReduction range_reduction_large{}; + LargeRangeReduction range_reduction_large{}; - // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA) + // |x| < 2^16 if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { - // |x| < 2^-27 - if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { - // Signed zeros. - if (LIBC_UNLIKELY(x == 0.0)) { - *sin_x = x; - *cos_x = 1.0; - return; - } - - // For |x| < 2^-27, max(|sin(x) - x|, |cos(x) - 1|) < ulp(x)/2. + // |x| < 2^-7 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) { + // |x| < 2^-27 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { + // Signed zeros. + if (LIBC_UNLIKELY(x == 0.0)) { + *sin_x = x; + *cos_x = 1.0; + return; + } + + // For |x| < 2^-27, max(|sin(x) - x|, |cos(x) - 1|) < ulp(x)/2. 
#ifdef LIBC_TARGET_CPU_HAS_FMA - *sin_x = fputil::multiply_add(x, -0x1.0p-54, x); - *cos_x = fputil::multiply_add(x, -x, 1.0); + *sin_x = fputil::multiply_add(x, -0x1.0p-54, x); + *cos_x = fputil::multiply_add(x, -x, 1.0); #else - *cos_x = fputil::round_result_slightly_down(1.0); - - if (LIBC_UNLIKELY(x_e < 4)) { - int rounding_mode = fputil::quick_get_round(); - if (rounding_mode == FE_TOWARDZERO || - (xbits.sign() == Sign::POS && rounding_mode == FE_DOWNWARD) || - (xbits.sign() == Sign::NEG && rounding_mode == FE_UPWARD)) - *sin_x = FPBits(xbits.uintval() - 1).get_val(); - } - *sin_x = fputil::multiply_add(x, -0x1.0p-54, x); + *cos_x = fputil::round_result_slightly_down(1.0); + + if (LIBC_UNLIKELY(x_e < 4)) { + int rounding_mode = fputil::quick_get_round(); + if (rounding_mode == FE_TOWARDZERO || + (xbits.sign() == Sign::POS && rounding_mode == FE_DOWNWARD) || + (xbits.sign() == Sign::NEG && rounding_mode == FE_UPWARD)) + *sin_x = FPBits(xbits.uintval() - 1).get_val(); + } + *sin_x = fputil::multiply_add(x, -0x1.0p-54, x); #endif // LIBC_TARGET_CPU_HAS_FMA - return; + return; + } + // No range reduction needed. + k = 0; + y.lo = 0.0; + y.hi = x; + } else { + // Small range reduction. + k = range_reduction_small(x, y); } - - // // Small range reduction. - k = range_reduction_small(x, y); } else { // Inf or NaN if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { @@ -91,56 +95,46 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { } // Large range reduction. 
- k = range_reduction_large.compute_high_part(x); - y = range_reduction_large.fast(); + k = range_reduction_large.fast(x, y); } DoubleDouble sin_y, cos_y; - generic::sincos_eval(y, sin_y, cos_y); + [[maybe_unused]] double err = generic::sincos_eval(y, sin_y, cos_y); // Look up sin(k * pi/128) and cos(k * pi/128) - // Memory saving versions: - - // Use 128-entry table instead: - // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127]; - // uint64_t sin_s = static_cast(k & 128) << (63 - 7); - // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127]; - // uint64_t cos_s = static_cast((k + 64) & 128) << (63 - 7); - // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - - // Use 64-entry table instead: - // auto get_idx_dd = [](unsigned kk) -> DoubleDouble { - // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - // DoubleDouble ans = SIN_K_PI_OVER_128[idx]; - // if (kk & 128) { - // ans.hi = -ans.hi; - // ans.lo = -ans.lo; - // } - // return ans; - // }; - // DoubleDouble sin_k = get_idx_dd(k); - // DoubleDouble cos_k = get_idx_dd(k + 64); - +#ifdef LIBC_MATH_HAS_SMALL_TABLES + // Memory saving versions. Use 65-entry table. + auto get_idx_dd = [](unsigned kk) -> DoubleDouble { + unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + DoubleDouble ans = SIN_K_PI_OVER_128[idx]; + if (kk & 128) { + ans.hi = -ans.hi; + ans.lo = -ans.lo; + } + return ans; + }; + DoubleDouble sin_k = get_idx_dd(k); + DoubleDouble cos_k = get_idx_dd(k + 64); +#else // Fast look up version, but needs 256-entry table. // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). 
DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 255]; DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 255]; +#endif // LIBC_MATH_HAS_SMALL_TABLES + DoubleDouble msin_k{-sin_k.lo, -sin_k.hi}; // After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128). // So k is an integer and -pi / 256 <= y <= pi / 256. // Then sin(x) = sin((k * pi/128 + y) // = sin(y) * cos(k*pi/128) + cos(y) * sin(k*pi/128) - DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k); - DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k); + DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k); + DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k); // cos(x) = cos((k * pi/128 + y) // = cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128) - DoubleDouble cos_k_cos_y = fputil::quick_mult(cos_y, cos_k); - DoubleDouble msin_k_sin_y = fputil::quick_mult(sin_y, msin_k); + DoubleDouble cos_k_cos_y = fputil::quick_mult(cos_y, cos_k); + DoubleDouble msin_k_sin_y = fputil::quick_mult(sin_y, msin_k); DoubleDouble sin_dd = fputil::exact_add(sin_k_cos_y.hi, cos_k_sin_y.hi); @@ -149,24 +143,17 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { sin_dd.lo += sin_k_cos_y.lo + cos_k_sin_y.lo; cos_dd.lo += msin_k_sin_y.lo + cos_k_cos_y.lo; -#ifdef LIBC_MATH_SINCOS_SKIP_ACCURATE_PASS +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS *sin_x = sin_dd.hi + sin_dd.lo; *cos_x = cos_dd.hi + cos_dd.lo; return; #else // Accurate test and pass for correctly rounded implementation. -#ifdef LIBC_TARGET_CPU_HAS_FMA - constexpr double ERR = 0x1.0p-70; -#else - // TODO: Improve non-FMA fast pass accuracy. 
- constexpr double ERR = 0x1.0p-66; -#endif // LIBC_TARGET_CPU_HAS_FMA - - double sin_lp = sin_dd.lo + ERR; - double sin_lm = sin_dd.lo - ERR; - double cos_lp = cos_dd.lo + ERR; - double cos_lm = cos_dd.lo - ERR; + double sin_lp = sin_dd.lo + err; + double sin_lm = sin_dd.lo - err; + double cos_lp = cos_dd.lo + err; + double cos_lm = cos_dd.lo - err; double sin_upper = sin_dd.hi + sin_lp; double sin_lower = sin_dd.hi + sin_lm; @@ -182,7 +169,7 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { Float128 u_f128, sin_u, cos_u; if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) - u_f128 = generic::range_reduction_small_f128(x); + u_f128 = range_reduction_small_f128(x); else u_f128 = range_reduction_large.accurate(); @@ -190,7 +177,7 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { auto get_sin_k = [](unsigned kk) -> Float128 { unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - Float128 ans = generic::SIN_K_PI_OVER_128_F128[idx]; + Float128 ans = SIN_K_PI_OVER_128_F128[idx]; if (kk & 128) ans.sign = Sign::NEG; return ans; @@ -222,7 +209,7 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { fputil::quick_add(fputil::quick_mul(cos_k_f128, cos_u), fputil::quick_mul(msin_k_f128, sin_u))); -#endif // !LIBC_MATH_SINCOS_SKIP_ACCURATE_PASS +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/sincos_eval.h b/libc/src/math/generic/sincos_eval.h index e491467c5663fd..6cd1da4721bf57 100644 --- a/libc/src/math/generic/sincos_eval.h +++ b/libc/src/math/generic/sincos_eval.h @@ -23,8 +23,8 @@ namespace generic { using fputil::DoubleDouble; using Float128 = fputil::DyadicFloat<128>; -LIBC_INLINE void sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u, - DoubleDouble &cos_u) { +LIBC_INLINE double sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u, + DoubleDouble &cos_u) { // Evaluate sin(y) = sin(x - k * 
(pi/128)) // We use the degree-7 Taylor approximation: // sin(y) ~ y - y^3/3! + y^5/5! - y^7/7! @@ -61,9 +61,19 @@ LIBC_INLINE void sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u, // + u_hi u_lo (-1 + u_hi^2/6) // We compute 1 - u_hi^2 accurately: // v_hi + v_lo ~ 1 - u_hi^2/2 - double v_hi = fputil::multiply_add(u.hi, u.hi * (-0.5), 1.0); - double v_lo = 1.0 - v_hi; // Exact - v_lo = fputil::multiply_add(u.hi, u.hi * (-0.5), v_lo); + // with error <= 2^-105. + double u_hi_neg_half = (-0.5) * u.hi; + DoubleDouble v; + +#ifdef LIBC_TARGET_CPU_HAS_FMA + v.hi = fputil::multiply_add(u.hi, u_hi_neg_half, 1.0); + v.lo = 1.0 - v.hi; // Exact + v.lo = fputil::multiply_add(u.hi, u_hi_neg_half, v.lo); +#else + DoubleDouble u_hi_sq_neg_half = fputil::exact_mult(u.hi, u_hi_neg_half); + v = fputil::exact_add(1.0, u_hi_sq_neg_half.hi); + v.lo += u_hi_sq_neg_half.lo; +#endif // LIBC_TARGET_CPU_HAS_FMA // r1 ~ -1/720 + u_hi^2 / 40320 double r1 = fputil::multiply_add(u_hi_sq, 0x1.a01a01a01a01ap-16, @@ -75,12 +85,15 @@ LIBC_INLINE void sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u, // r2 ~ 1/24 + u_hi^2 (-1/720 + u_hi^2 / 40320) double r2 = fputil::multiply_add(u_hi_sq, r1, 0x1.5555555555555p-5); // s2 ~ v_lo + u_hi * u_lo * (-1 + u_hi^2 / 6) - double s2 = fputil::multiply_add(u_hi_u_lo, s1, v_lo); + double s2 = fputil::multiply_add(u_hi_u_lo, s1, v.lo); double cos_lo = fputil::multiply_add(u_hi_4, r2, s2); // Overall, |cos(y) - (v_hi + cos_lo)| < 2*ulp(u_hi^4) < 2^-75. 
sin_u = fputil::exact_add(u.hi, sin_lo); - cos_u = fputil::exact_add(v_hi, cos_lo); + cos_u = fputil::exact_add(v.hi, cos_lo); + + return fputil::multiply_add(fputil::FPBits(u_hi_3).abs().get_val(), + 0x1.0p-51, 0x1.0p-105); } LIBC_INLINE void sincos_eval(const Float128 &u, Float128 &sin_u, diff --git a/libc/src/math/generic/tan.cpp b/libc/src/math/generic/tan.cpp index 45fd8bb9156be0..f9be25ed866e1d 100644 --- a/libc/src/math/generic/tan.cpp +++ b/libc/src/math/generic/tan.cpp @@ -20,16 +20,13 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/math/generic/range_reduction_double_common.h" -// TODO: We might be able to improve the performance of large range reduction of -// non-FMA targets further by operating directly on 25-bit chunks of 128/pi and -// pre-split SIN_K_PI_OVER_128, but that might double the memory footprint of -// those lookup table. -#include "range_reduction_double_common.h" - -#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0) -#define LIBC_MATH_TAN_SKIP_ACCURATE_PASS -#endif +#ifdef LIBC_TARGET_CPU_HAS_FMA +#include "range_reduction_double_fma.h" +#else +#include "range_reduction_double_nofma.h" +#endif // LIBC_TARGET_CPU_HAS_FMA namespace LIBC_NAMESPACE_DECL { @@ -38,7 +35,7 @@ using Float128 = typename fputil::DyadicFloat<128>; namespace { -LIBC_INLINE DoubleDouble tan_eval(const DoubleDouble &u) { +LIBC_INLINE double tan_eval(const DoubleDouble &u, DoubleDouble &result) { // Evaluate tan(y) = tan(x - k * (pi/128)) // We use the degree-9 Taylor approximation: // tan(y) ~ P(y) = y + y^3/3 + 2*y^5/15 + 17*y^7/315 + 62*y^9/2835 @@ -69,10 +66,12 @@ LIBC_INLINE DoubleDouble tan_eval(const DoubleDouble &u) { // Overall, |tan(y) - (u_hi + tan_lo)| < ulp(u_hi^3) <= 2^-71. 
// And the relative errors is: // |(tan(y) - (u_hi + tan_lo)) / tan(y) | <= 2*ulp(u_hi^2) < 2^-64 - - return fputil::exact_add(u.hi, tan_lo); + result = fputil::exact_add(u.hi, tan_lo); + return fputil::multiply_add(fputil::FPBits(u_hi_3).abs().get_val(), + 0x1.0p-51, 0x1.0p-102); } +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS // Accurate evaluation of tan for small u. [[maybe_unused]] Float128 tan_eval(const Float128 &u) { Float128 u_sq = fputil::quick_mul(u, u); @@ -117,6 +116,7 @@ LIBC_INLINE DoubleDouble tan_eval(const DoubleDouble &u) { fputil::quick_mul(q1, fputil::quick_add(TWO, fputil::quick_mul(b, q1))); return fputil::quick_mul(a, q2); } +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS } // anonymous namespace @@ -128,33 +128,38 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { DoubleDouble y; unsigned k; - generic::LargeRangeReduction range_reduction_large{}; + LargeRangeReduction range_reduction_large{}; - // |x| < 2^32 (with FMA) or |x| < 2^23 (w/o FMA) + // |x| < 2^16 if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) { - // |x| < 2^-27 - if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { - // Signed zeros. - if (LIBC_UNLIKELY(x == 0.0)) - return x; + // |x| < 2^-7 + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) { + // |x| < 2^-27, |tan(x) - x| < ulp(x)/2. + if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) { + // Signed zeros. + if (LIBC_UNLIKELY(x == 0.0)) + return x; - // For |x| < 2^-27, |tan(x) - x| < ulp(x)/2. 
#ifdef LIBC_TARGET_CPU_HAS_FMA - return fputil::multiply_add(x, 0x1.0p-54, x); + return fputil::multiply_add(x, 0x1.0p-54, x); #else - if (LIBC_UNLIKELY(x_e < 4)) { - int rounding_mode = fputil::quick_get_round(); - if (rounding_mode == FE_TOWARDZERO || - (xbits.sign() == Sign::POS && rounding_mode == FE_DOWNWARD) || - (xbits.sign() == Sign::NEG && rounding_mode == FE_UPWARD)) - return FPBits(xbits.uintval() + 1).get_val(); - } - return fputil::multiply_add(x, 0x1.0p-54, x); + if (LIBC_UNLIKELY(x_e < 4)) { + int rounding_mode = fputil::quick_get_round(); + if ((xbits.sign() == Sign::POS && rounding_mode == FE_UPWARD) || + (xbits.sign() == Sign::NEG && rounding_mode == FE_DOWNWARD)) + return FPBits(xbits.uintval() + 1).get_val(); + } + return fputil::multiply_add(x, 0x1.0p-54, x); #endif // LIBC_TARGET_CPU_HAS_FMA + } + // No range reduction needed. + k = 0; + y.lo = 0.0; + y.hi = x; + } else { + // Small range reduction. + k = range_reduction_small(x, y); } - - // // Small range reduction. - k = range_reduction_small(x, y); } else { // Inf or NaN if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { @@ -167,42 +172,32 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { } // Large range reduction. 
- k = range_reduction_large.compute_high_part(x); - y = range_reduction_large.fast(); + k = range_reduction_large.fast(x, y); } - DoubleDouble tan_y = tan_eval(y); + DoubleDouble tan_y; + [[maybe_unused]] double err = tan_eval(y, tan_y); // Look up sin(k * pi/128) and cos(k * pi/128) - // Memory saving versions: - - // Use 128-entry table instead: - // DoubleDouble sin_k = SIN_K_PI_OVER_128[k & 127]; - // uint64_t sin_s = static_cast(k & 128) << (63 - 7); - // sin_k.hi = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // sin_k.lo = FPBits(FPBits(sin_k.hi).uintval() ^ sin_s).get_val(); - // DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 127]; - // uint64_t cos_s = static_cast((k + 64) & 128) << (63 - 7); - // cos_k.hi = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - // cos_k.lo = FPBits(FPBits(cos_k.hi).uintval() ^ cos_s).get_val(); - - // Use 64-entry table instead: - // auto get_idx_dd = [](unsigned kk) -> DoubleDouble { - // unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - // DoubleDouble ans = SIN_K_PI_OVER_128[idx]; - // if (kk & 128) { - // ans.hi = -ans.hi; - // ans.lo = -ans.lo; - // } - // return ans; - // }; - // DoubleDouble msin_k = get_idx_dd(k + 128); - // DoubleDouble cos_k = get_idx_dd(k + 64); - +#ifdef LIBC_MATH_HAS_SMALL_TABLES + // Memory saving versions. Use 65-entry table: + auto get_idx_dd = [](unsigned kk) -> DoubleDouble { + unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); + DoubleDouble ans = SIN_K_PI_OVER_128[idx]; + if (kk & 128) { + ans.hi = -ans.hi; + ans.lo = -ans.lo; + } + return ans; + }; + DoubleDouble msin_k = get_idx_dd(k + 128); + DoubleDouble cos_k = get_idx_dd(k + 64); +#else // Fast look up version, but needs 256-entry table. // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128). 
DoubleDouble msin_k = SIN_K_PI_OVER_128[(k + 128) & 255]; DoubleDouble cos_k = SIN_K_PI_OVER_128[(k + 64) & 255]; +#endif // LIBC_MATH_HAS_SMALL_TABLES // After range reduction, k = round(x * 128 / pi) and y = x - k * (pi / 128). // So k is an integer and -pi / 256 <= y <= pi / 256. @@ -212,8 +207,8 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { // / (cos(y) * cos(k*pi/128) - sin(y) * sin(k*pi/128)) // = (sin(k*pi/128) + tan(y) * cos(k*pi/128)) / // / (cos(k*pi/128) - tan(y) * sin(k*pi/128)) - DoubleDouble cos_k_tan_y = fputil::quick_mult(tan_y, cos_k); - DoubleDouble msin_k_tan_y = fputil::quick_mult(tan_y, msin_k); + DoubleDouble cos_k_tan_y = fputil::quick_mult(tan_y, cos_k); + DoubleDouble msin_k_tan_y = fputil::quick_mult(tan_y, msin_k); // num_dd = sin(k*pi/128) + tan(y) * cos(k*pi/128) DoubleDouble num_dd = fputil::exact_add(cos_k_tan_y.hi, -msin_k.hi); @@ -222,7 +217,7 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { num_dd.lo += cos_k_tan_y.lo - msin_k.lo; den_dd.lo += msin_k_tan_y.lo + cos_k.lo; -#ifdef LIBC_MATH_TAN_SKIP_ACCURATE_PASS +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS double tan_x = (num_dd.hi + num_dd.lo) / (den_dd.hi + den_dd.lo); return tan_x; #else @@ -231,18 +226,16 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { // Accurate double-double division DoubleDouble tan_x = fputil::div(num_dd, den_dd); - // Relative errors for k != 0 mod 64 is: - // absolute errors / min(sin(k*pi/128), cos(k*pi/128)) <= 2^-71 / 2^-7 - // = 2^-64. - // For k = 0 mod 64, the relative errors is bounded by: - // 2^-71 / 2^(exponent of x). - constexpr int ERR = 64; + // Simple error bound: |1 / den_dd| < 2^(1 + floor(-log2(den_dd)))). 
+ uint64_t den_inv = (static_cast(FPBits::EXP_BIAS + 1) + << (FPBits::FRACTION_LEN + 1)) - + (FPBits(den_dd.hi).uintval() & FPBits::EXP_MASK); - int y_exp = 7 + FPBits(y.hi).get_exponent(); - int rel_err_exp = ERR + static_cast((k & 63) == 0) * y_exp; - int64_t tan_x_err = static_cast(FPBits(tan_x.hi).uintval()) - - (static_cast(rel_err_exp) << 52); - double tan_err = FPBits(static_cast(tan_x_err)).get_val(); + // For tan_x = (num_dd + err) / (den_dd + err), the error is bounded by: + // | tan_x - num_dd / den_dd | <= err * ( 1 + | tan_x * den_dd | ). + double tan_err = + err * fputil::multiply_add(FPBits(den_inv).get_val(), + FPBits(tan_x.hi).abs().get_val(), 1.0); double err_higher = tan_x.lo + tan_err; double err_lower = tan_x.lo - tan_err; @@ -256,7 +249,7 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { Float128 u_f128; if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) - u_f128 = generic::range_reduction_small_f128(x); + u_f128 = range_reduction_small_f128(x); else u_f128 = range_reduction_large.accurate(); @@ -264,7 +257,7 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { auto get_sin_k = [](unsigned kk) -> Float128 { unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63); - Float128 ans = generic::SIN_K_PI_OVER_128_F128[idx]; + Float128 ans = SIN_K_PI_OVER_128_F128[idx]; if (kk & 128) ans.sign = Sign::NEG; return ans; @@ -292,7 +285,7 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { // https://github.com/llvm/llvm-project/issues/96452. 
return static_cast(result); -#endif // !LIBC_MATH_TAN_SKIP_ACCURATE_PASS +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/math/cos_test.cpp b/libc/test/src/math/cos_test.cpp index 484d47fd3e96c4..e2d47917e545e0 100644 --- a/libc/test/src/math/cos_test.cpp +++ b/libc/test/src/math/cos_test.cpp @@ -50,8 +50,7 @@ TEST_F(LlvmLibcCosTest, TrickyInputs) { 0x1.2b5fe88a9d8d5p+903, 0x1.f6d7518808571p+1023, -0x1.a880417b7b119p+1023, 0x1.00a33764a0a83p-7, 0x1.fe81868fc47fep+1, 0x1.0da8cc189b47dp-10, - 0x1.da1838053b866p+5, - + 0x1.da1838053b866p+5, 0x1.ffffffffe854bp199, }; constexpr int N = sizeof(INPUTS) / sizeof(INPUTS[0]); diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp index 60f6ef5c844630..d4c6bd416a4099 100644 --- a/libc/test/src/math/sin_test.cpp +++ b/libc/test/src/math/sin_test.cpp @@ -20,11 +20,13 @@ using LIBC_NAMESPACE::testing::tlog; TEST_F(LlvmLibcSinTest, TrickyInputs) { constexpr double INPUTS[] = { - 0x1.940c877fb7dacp-7, 0x1.fffffffffdb6p24, 0x1.fd4da4ef37075p29, - 0x1.b951f1572eba5p+31, 0x1.55202aefde314p+31, 0x1.85fc0f04c0128p101, - 0x1.7776c2343ba4ep101, 0x1.678309fa50d58p110, 0x1.fffffffffef4ep199, - -0x1.ab514bfc61c76p+7, -0x1.f7898d5a756ddp+2, -0x1.f42fb19b5b9b2p-6, - 0x1.5f09cad750ab1p+3, -0x1.14823229799c2p+7, -0x1.0285070f9f1bcp-5, + 0x1.5f09cad750ab1p+3, 0x1.fff781921b61fp15, -0x1.f635b70b92407p-21, + -0x1.3ecf146c39c0cp-20, 0x1.6ac5b262ca1ffp849, 0x1.6c6cbc45dc8dep5, + 0x1.921fb5443p-7, 0x1.940c877fb7dacp-7, 0x1.fffffffffdb6p24, + 0x1.fd4da4ef37075p29, 0x1.b951f1572eba5p+31, 0x1.55202aefde314p+31, + 0x1.85fc0f04c0128p101, 0x1.7776c2343ba4ep101, 0x1.678309fa50d58p110, + 0x1.fffffffffef4ep199, -0x1.ab514bfc61c76p+7, -0x1.f7898d5a756ddp+2, + -0x1.f42fb19b5b9b2p-6, -0x1.14823229799c2p+7, -0x1.0285070f9f1bcp-5, 0x1.23f40dccdef72p+0, 0x1.43cf16358c9d7p+0, 0x1.addf3b9722265p+0, 0x1.48ff1782ca91dp+8, 0x1.a211877de55dbp+4, 0x1.dcbfda0c7559ep+8, 0x1.1ffb509f3db15p+5, 
0x1.2345d1e090529p+5, 0x1.ae945054939c2p+10, diff --git a/libc/test/src/math/tan_test.cpp b/libc/test/src/math/tan_test.cpp index 1ca67afdaddf25..12dfc02bac111a 100644 --- a/libc/test/src/math/tan_test.cpp +++ b/libc/test/src/math/tan_test.cpp @@ -20,17 +20,20 @@ using LIBC_NAMESPACE::testing::tlog; TEST_F(LlvmLibcTanTest, TrickyInputs) { constexpr double INPUTS[] = { - 0x1.d130383d17321p-27, 0x1.8000000000009p-23, 0x1.8000000000024p-22, - 0x1.800000000009p-21, 0x1.20000000000f3p-20, 0x1.800000000024p-20, - 0x1.e0000000001c2p-20, 0x1.00452f0e0134dp-13, 0x1.0da8cc189b47dp-10, - 0x1.00a33764a0a83p-7, 0x1.911a18779813fp-7, 0x1.940c877fb7dacp-7, - 0x1.f42fb19b5b9b2p-6, 0x1.0285070f9f1bcp-5, 0x1.89f0f5241255bp-2, + 0x0.0000000000001p-1022, 0x1.d130383d17321p-27, 0x1.8000000000009p-23, + 0x1.8000000000024p-22, 0x1.800000000009p-21, 0x1.20000000000f3p-20, + 0x1.800000000024p-20, 0x1.e0000000001c2p-20, 0x1.00452f0e0134dp-13, + 0x1.0da8cc189b47dp-10, 0x1.00a33764a0a83p-7, 0x1.911a18779813fp-7, + 0x1.940c877fb7dacp-7, 0x1.f42fb19b5b9b2p-6, 0x1.0285070f9f1bcp-5, + 0x1.90e833c6969c7p-4, 0x1.91d4b77c527eap-3, 0x1.89f0f5241255bp-2, 0x1.6ca9ef729af76p-1, 0x1.23f40dccdef72p+0, 0x1.43cf16358c9d7p+0, + 0x1.90f422b49115ep+0, 0x1.9220efee9fc7ep+0, 0x1.a224411cdebcep+0, 0x1.addf3b9722265p+0, 0x1.ae78d360afa15p+0, 0x1.fe81868fc47fep+1, - 0x1.e31b55306f22cp+2, 0x1.e639103a05997p+2, 0x1.f7898d5a756ddp+2, - 0x1.1685973506319p+3, 0x1.5f09cad750ab1p+3, 0x1.aaf85537ea4c7p+3, - 0x1.4f2b874135d27p+4, 0x1.13114266f9764p+4, 0x1.a211877de55dbp+4, - 0x1.a5eece87e8606p+4, 0x1.a65d441ea6dcep+4, 0x1.045457ae3994p+5, + 0x1.e31b55306f22cp+2, 0x1.e639103a05997p+2, 0x1.f69d074a3358fp+2, + 0x1.f7898d5a756ddp+2, 0x1.1685973506319p+3, 0x1.5f09cad750ab1p+3, + 0x1.aaf85537ea4c7p+3, 0x1.c50ddc4f513b4p+3, 0x1.13114266f9764p+4, + 0x1.4f2b874135d27p+4, 0x1.a211877de55dbp+4, 0x1.a5eece87e8606p+4, + 0x1.a65d441ea6dcep+4, 0x1.ab8c2f8ab5b7p+4, 0x1.045457ae3994p+5, 0x1.1ffb509f3db15p+5, 0x1.2345d1e090529p+5, 
0x1.c96e28eb679f8p+5, 0x1.da1838053b866p+5, 0x1.be886d9c2324dp+6, 0x1.ab514bfc61c76p+7, 0x1.14823229799c2p+7, 0x1.48ff1782ca91dp+8, 0x1.dcbfda0c7559ep+8, From e01ae3920dd98779f2e58aa8f103ae3b6c6b5499 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 20:56:25 -0700 Subject: [PATCH 116/177] [NFC][sanitizer] Use tid_t instead of int in ThreadLister (#111941) --- compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp | 4 ++-- compiler-rt/lib/sanitizer_common/sanitizer_linux.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index a4e58133c79f08..31750cf65ab6eb 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1086,7 +1086,7 @@ ThreadLister::Result ThreadLister::ListThreads( } } -const char *ThreadLister::LoadStatus(int tid) { +const char *ThreadLister::LoadStatus(tid_t tid) { auto cleanup = at_scope_exit([&] { // Resize back to capacity if it is downsized by `ReadFileToVector`. buffer_.resize(buffer_.capacity()); @@ -1097,7 +1097,7 @@ const char *ThreadLister::LoadStatus(int tid) { return buffer_.data(); } -bool ThreadLister::IsAlive(int tid) { +bool ThreadLister::IsAlive(tid_t tid) { // /proc/%d/task/%d/status uses same call to detect alive threads as // proc_task_readdir. See task_state implementation in Linux. 
static const char kPrefix[] = "\nPPid:"; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h index 07d9528813b3fe..8b7874bb5a3494 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.h @@ -103,10 +103,10 @@ class ThreadLister { Ok, }; Result ListThreads(InternalMmapVector *threads); - const char *LoadStatus(int tid); + const char *LoadStatus(tid_t tid); private: - bool IsAlive(int tid); + bool IsAlive(tid_t tid); InternalScopedString task_path_; InternalScopedString status_path_; From 59b2945c705671a676806b8985c3ade8d6088ab1 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 20:57:34 -0700 Subject: [PATCH 117/177] [sanitizer] Fix ThreadLister::IsAlive (#111942) 'status_path_' must include `tid`. Regression from #111909. --- compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp | 3 ++- .../lib/sanitizer_common/tests/sanitizer_linux_test.cpp | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp index 31750cf65ab6eb..33107eb0b42993 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp @@ -1027,7 +1027,6 @@ bool internal_sigismember(__sanitizer_sigset_t *set, int signum) { // ThreadLister implementation. ThreadLister::ThreadLister(pid_t pid) : buffer_(4096) { task_path_.AppendF("/proc/%d/task", pid); - status_path_.AppendF("%s/status", task_path_.data()); } ThreadLister::Result ThreadLister::ListThreads( @@ -1087,6 +1086,8 @@ ThreadLister::Result ThreadLister::ListThreads( } const char *ThreadLister::LoadStatus(tid_t tid) { + status_path_.clear(); + status_path_.AppendF("%s/%llu/status", task_path_.data(), tid); auto cleanup = at_scope_exit([&] { // Resize back to capacity if it is downsized by `ReadFileToVector`. 
buffer_.resize(buffer_.capacity()); diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp index b286ab72a5c795..ce4a40444cd496 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp @@ -143,6 +143,9 @@ TEST_F(ThreadListerTest, ThreadListerSeesAllSpawnedThreads) { std::vector listed_tids = ReadTidsToVector(&thread_lister); ASSERT_TRUE(HasElement(listed_tids, self_tid)); ASSERT_TRUE(Includes(listed_tids, tids_)); + + ASSERT_NE(nullptr, thread_lister.LoadStatus(self_tid)); + for (auto tid : tids_) ASSERT_NE(nullptr, thread_lister.LoadStatus(tid)); } TEST_F(ThreadListerTest, DoNotForgetThreads) { From 36b07077673b6c639804160c6b31ce57718e13db Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 05:58:25 +0200 Subject: [PATCH 118/177] [clang][bytecode] Return an lvalue path for dummy pointers (#111862) Not doing this is wrong in general and we need to reject expressions where it would matter differently. --- clang/lib/AST/ByteCode/Compiler.cpp | 16 ++++++++++------ clang/lib/AST/ByteCode/Pointer.cpp | 5 ----- clang/test/AST/ByteCode/cxx1z.cpp | 3 +++ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index ba4c5600d613b0..0a3b38b0dc6e57 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -6006,6 +6006,9 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { return this->emitGetPtrParam(It->second.Offset, E); } + + if (D->getType()->isReferenceType()) + return false; // FIXME: Do we need to emit InvalidDeclRef? } // In case we need to re-visit a declaration. 
@@ -6042,9 +6045,7 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { const auto typeShouldBeVisited = [&](QualType T) -> bool { if (T.isConstant(Ctx.getASTContext())) return true; - if (const auto *RT = T->getAs()) - return RT->getPointeeType().isConstQualified(); - return false; + return T->isReferenceType(); }; // DecompositionDecls are just proxies for us. @@ -6060,9 +6061,12 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { // other words, we're evaluating the initializer, just to know if we can // evaluate the initializer. if (VD->isLocalVarDecl() && typeShouldBeVisited(VD->getType()) && - VD->getInit() && !VD->getInit()->isValueDependent() && - VD->evaluateValue()) - return revisit(VD); + VD->getInit() && !VD->getInit()->isValueDependent()) { + + if (VD->evaluateValue()) + return revisit(VD); + return this->emitInvalidDeclRef(cast(E), E); + } } } else { if (const auto *VD = dyn_cast(D); diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp index a52f0e336ef298..75b00dcb2ab242 100644 --- a/clang/lib/AST/ByteCode/Pointer.cpp +++ b/clang/lib/AST/ByteCode/Pointer.cpp @@ -253,11 +253,6 @@ APValue Pointer::toAPValue(const ASTContext &ASTCtx) const { } } - // FIXME(perf): We compute the lvalue path above, but we can't supply it - // for dummy pointers (that causes crashes later in CheckConstantExpression). - if (isDummy()) - Path.clear(); - // We assemble the LValuePath starting from the innermost pointer to the // outermost one. SO in a.b.c, the first element in Path will refer to // the field 'c', while later code expects it to refer to 'a'. 
diff --git a/clang/test/AST/ByteCode/cxx1z.cpp b/clang/test/AST/ByteCode/cxx1z.cpp index 2b5d215f016548..1a06597fa348fe 100644 --- a/clang/test/AST/ByteCode/cxx1z.cpp +++ b/clang/test/AST/ByteCode/cxx1z.cpp @@ -10,3 +10,6 @@ namespace Temp { A c; // both-error {{reference to subobject of temporary object}} A d; // both-error {{pointer to subobject of temporary object}} } + +char arr[3]; +A d; // both-error {{refers to subobject '&arr[1]'}} From 374886a360424d5f1c38359378a504408a9f64ed Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 20:59:48 -0700 Subject: [PATCH 119/177] [NFC][sanitizer] Check suspended threads outside `ThreadSuspender::SuspendThread` (#111943) Allows to distinguish failure from stopped threads. --- .../sanitizer_stoptheworld_linux_libcdep.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp index 6ebca965f6a334..ebe7b6f2ee8cc2 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp @@ -137,10 +137,6 @@ class ThreadSuspender { }; bool ThreadSuspender::SuspendThread(tid_t tid) { - // Are we already attached to this thread? - // Currently this check takes linear time, however the number of threads is - // usually small. - if (suspended_threads_list_.ContainsTid(tid)) return false; int pterrno; if (internal_iserror(internal_ptrace(PTRACE_ATTACH, tid, nullptr, nullptr), &pterrno)) { @@ -226,6 +222,11 @@ bool ThreadSuspender::SuspendAllThreads() { break; } for (tid_t tid : threads) { + // Are we already attached to this thread? + // Currently this check takes linear time, however the number of threads + // is usually small. 
+ if (suspended_threads_list_.ContainsTid(tid)) + continue; if (SuspendThread(tid)) retry = true; else From 36639af8adcd302e12f2962fd2b917d41323e5ae Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 21:01:07 -0700 Subject: [PATCH 120/177] [NFC][sanitizer] VReport incomplete list (#111944) --- .../sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp index ebe7b6f2ee8cc2..945da99d41f4ea 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stoptheworld_linux_libcdep.cpp @@ -216,6 +216,7 @@ bool ThreadSuspender::SuspendAllThreads() { VReport(1, "Failed to list threads\n"); return false; case ThreadLister::Incomplete: + VReport(1, "Incomplete list\n"); retry = true; break; case ThreadLister::Ok: From e556f0787cb9675a120fcfc91156edcd27047772 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 21:03:29 -0700 Subject: [PATCH 121/177] [NFC][asan] Cleanup AsanThreadIdAndName ctor/init (#111923) Co-authored-by: YunQiang Su --- compiler-rt/lib/asan/asan_descriptions.cpp | 26 +++++++++------------- compiler-rt/lib/asan/asan_descriptions.h | 2 -- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/compiler-rt/lib/asan/asan_descriptions.cpp b/compiler-rt/lib/asan/asan_descriptions.cpp index 674fe9c1e90be0..db87789aea86a0 100644 --- a/compiler-rt/lib/asan/asan_descriptions.cpp +++ b/compiler-rt/lib/asan/asan_descriptions.cpp @@ -20,24 +20,20 @@ namespace __asan { AsanThreadIdAndName::AsanThreadIdAndName(AsanThreadContext *t) { - Init(t->tid, t->name); -} - -AsanThreadIdAndName::AsanThreadIdAndName(u32 tid) { - if (tid == kInvalidTid) { - Init(tid, ""); - } else { - asanThreadRegistry().CheckLocked(); - AsanThreadContext *t = 
GetThreadContextByTidLocked(tid); - Init(tid, t->name); + if (!t) { + internal_snprintf(name, sizeof(name), "T-1"); + return; } + int len = internal_snprintf(name, sizeof(name), "T%d", t->tid); + CHECK(((unsigned int)len) < sizeof(name)); + if (internal_strlen(t->name)) + internal_snprintf(&name[len], sizeof(name) - len, " (%s)", t->name); } -void AsanThreadIdAndName::Init(u32 tid, const char *tname) { - int len = internal_snprintf(name, sizeof(name), "T%d", tid); - CHECK(((unsigned int)len) < sizeof(name)); - if (tname[0] != '\0') - internal_snprintf(&name[len], sizeof(name) - len, " (%s)", tname); +AsanThreadIdAndName::AsanThreadIdAndName(u32 tid) + : AsanThreadIdAndName( + tid == kInvalidTid ? nullptr : GetThreadContextByTidLocked(tid)) { + asanThreadRegistry().CheckLocked(); } void DescribeThread(AsanThreadContext *context) { diff --git a/compiler-rt/lib/asan/asan_descriptions.h b/compiler-rt/lib/asan/asan_descriptions.h index 650e2eb9173ad5..a614f47d461bbd 100644 --- a/compiler-rt/lib/asan/asan_descriptions.h +++ b/compiler-rt/lib/asan/asan_descriptions.h @@ -35,8 +35,6 @@ class AsanThreadIdAndName { const char *c_str() const { return &name[0]; } private: - void Init(u32 tid, const char *tname); - char name[128]; }; From df4c91342577cd9a74f168ad8c98380538d5e7c4 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 21:04:25 -0700 Subject: [PATCH 122/177] [asan] Print `unique_id` instead of `tid` (#111925) Before the first reuse, after 2^32 threads they are equal. 
--- compiler-rt/lib/asan/asan_descriptions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/asan/asan_descriptions.cpp b/compiler-rt/lib/asan/asan_descriptions.cpp index db87789aea86a0..caec79313e22ff 100644 --- a/compiler-rt/lib/asan/asan_descriptions.cpp +++ b/compiler-rt/lib/asan/asan_descriptions.cpp @@ -24,7 +24,7 @@ AsanThreadIdAndName::AsanThreadIdAndName(AsanThreadContext *t) { internal_snprintf(name, sizeof(name), "T-1"); return; } - int len = internal_snprintf(name, sizeof(name), "T%d", t->tid); + int len = internal_snprintf(name, sizeof(name), "T%llu", t->unique_id); CHECK(((unsigned int)len) < sizeof(name)); if (internal_strlen(t->name)) internal_snprintf(&name[len], sizeof(name) - len, " (%s)", t->name); From 3cb4d20d5bcefd98454d0e181cd89f8ee6f16498 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 10 Oct 2024 21:19:49 -0700 Subject: [PATCH 123/177] [NFC][sanitizer] Simplify GetThreadLocked Now we can pass `invalid tid`. --- compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h b/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h index 2c7e5c276fa1c7..bf492c17f7e107 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_thread_registry.h @@ -101,7 +101,7 @@ class SANITIZER_MUTEX ThreadRegistry { // Should be guarded by ThreadRegistryLock. ThreadContextBase *GetThreadLocked(u32 tid) { - return threads_.empty() ? nullptr : threads_[tid]; + return tid < threads_.size() ? 
threads_[tid] : nullptr; } u32 NumThreadsLocked() const { return threads_.size(); } From bf81bd800fbcf1d11f149d897f55409e27ec59fb Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 10 Oct 2024 21:36:51 -0700 Subject: [PATCH 124/177] [ELF] Pass Ctx & --- lld/ELF/Arch/ARM.cpp | 8 ++++++-- lld/ELF/Arch/LoongArch.cpp | 6 +++--- lld/ELF/Arch/X86.cpp | 18 ++++++++++++++---- lld/ELF/Arch/X86_64.cpp | 18 ++++++++++++++---- lld/ELF/LinkerScript.cpp | 15 +++++++++------ 5 files changed, 46 insertions(+), 19 deletions(-) diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 43fbbc8d49131a..013e90cde6f995 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -48,6 +48,10 @@ class ARM final : public TargetInfo { bool inBranchRange(RelType type, uint64_t src, uint64_t dst) const override; void relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const override; + +private: +void encodeAluGroup(uint8_t *loc, const Relocation &rel, uint64_t val, + int group, bool check) const; }; enum class CodeState { Data = 0, Thumb = 2, Arm = 4 }; } // namespace @@ -534,8 +538,8 @@ static std::pair getRemAndLZForGroup(unsigned group, return {rem, lz}; } -static void encodeAluGroup(uint8_t *loc, const Relocation &rel, uint64_t val, - int group, bool check) { +void ARM::encodeAluGroup(uint8_t *loc, const Relocation &rel, uint64_t val, + int group, bool check) const { // ADD/SUB (immediate) add = bit23, sub = bit22 // immediate field carries is a 12-bit modified immediate, made up of a 4-bit // even rotate right and an 8-bit immediate. 
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index eca1d2fdc08caf..f16f8f0c8d5ce4 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -159,7 +159,7 @@ static bool isJirl(uint32_t insn) { return (insn & 0xfc000000) == JIRL; } -static void handleUleb128(uint8_t *loc, uint64_t val) { +static void handleUleb128(Ctx &ctx, uint8_t *loc, uint64_t val) { const uint32_t maxcount = 1 + 64 / 7; uint32_t count; const char *error = nullptr; @@ -700,7 +700,7 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel, write64le(loc, read64le(loc) + val); return; case R_LARCH_ADD_ULEB128: - handleUleb128(loc, val); + handleUleb128(ctx, loc, val); return; case R_LARCH_SUB6: *loc = (*loc & 0xc0) | ((*loc - val) & 0x3f); @@ -718,7 +718,7 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel, write64le(loc, read64le(loc) - val); return; case R_LARCH_SUB_ULEB128: - handleUleb128(loc, -val); + handleUleb128(ctx, loc, -val); return; case R_LARCH_MARK_LA: diff --git a/lld/ELF/Arch/X86.cpp b/lld/ELF/Arch/X86.cpp index 3314dcfc172f8c..4e574a520f1ff1 100644 --- a/lld/ELF/Arch/X86.cpp +++ b/lld/ELF/Arch/X86.cpp @@ -39,6 +39,12 @@ class X86 : public TargetInfo { RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override; void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override; + +private: + void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; }; } // namespace @@ -344,7 +350,8 @@ void X86::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { } } -static void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { if (rel.type == 
R_386_TLS_GD) { // Convert (loc[-2] == 0x04) // leal x@tlsgd(, %ebx, 1), %eax @@ -379,7 +386,8 @@ static void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { } } -static void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { if (rel.type == R_386_TLS_GD) { // Convert (loc[-2] == 0x04) // leal x@tlsgd(, %ebx, 1), %eax @@ -413,7 +421,8 @@ static void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) { // In some conditions, relocations can be optimized to avoid using GOT. // This function does that for Initial Exec to Local Exec case. -static void relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { // Ulrich's document section 6.2 says that @gotntpoff can // be used with MOVL or ADDL instructions. // @indntpoff is similar to @gotntpoff, but for use in @@ -450,7 +459,8 @@ static void relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { write32le(loc, val); } -static void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { if (rel.type == R_386_TLS_LDO_32) { write32le(loc, val); return; diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp index fbf1076fc71e52..121b7d9929b209 100644 --- a/lld/ELF/Arch/X86_64.cpp +++ b/lld/ELF/Arch/X86_64.cpp @@ -50,6 +50,12 @@ class X86_64 : public TargetInfo { bool deleteFallThruJmpInsn(InputSection &is, InputFile *file, InputSection *nextIS) const override; bool relaxOnce(int pass) const override; + +private: + void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; + void 
relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; }; } // namespace @@ -460,7 +466,8 @@ RelType X86_64::getDynRel(RelType type) const { return R_X86_64_NONE; } -static void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86_64::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { if (rel.type == R_X86_64_TLSGD) { // Convert // .byte 0x66 @@ -500,7 +507,8 @@ static void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { } } -static void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86_64::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { if (rel.type == R_X86_64_TLSGD) { // Convert // .byte 0x66 @@ -541,7 +549,8 @@ static void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) { // In some conditions, R_X86_64_GOTTPOFF relocation can be optimized to // R_X86_64_TPOFF32 so that it does not use GOT. -static void relaxTlsIeToLe(uint8_t *loc, const Relocation &, uint64_t val) { +void X86_64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { uint8_t *inst = loc - 3; uint8_t reg = loc[-1] >> 3; uint8_t *regSlot = loc - 1; @@ -582,7 +591,8 @@ static void relaxTlsIeToLe(uint8_t *loc, const Relocation &, uint64_t val) { write32le(loc, val + 4); } -static void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { +void X86_64::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, + uint64_t val) const { const uint8_t inst[] = { 0x66, 0x66, // .word 0x6666 0x66, // .byte 0x66 diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index f3f95ec589bd82..e9a637bac4e9bd 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -479,7 +479,7 @@ static void sortSections(MutableArrayRef vec, // --sort-section is handled as an inner SORT command. // 3. If one SORT command is given, and if it is SORT_NONE, don't sort. // 4. 
If no SORT command is given, sort according to --sort-section. -static void sortInputSections(MutableArrayRef vec, +static void sortInputSections(Ctx &ctx, MutableArrayRef vec, SortSectionPolicy outer, SortSectionPolicy inner) { if (outer == SortSectionPolicy::None) @@ -517,6 +517,7 @@ LinkerScript::computeInputSections(const InputSectionDescription *cmd, for (size_t i = begin; i != end; ++i) ret[i] = sections[indexes[i]]; sortInputSections( + ctx, MutableArrayRef(ret).slice(begin, end - begin), ctx.arg.sortSection, SortSectionPolicy::None); }; @@ -584,6 +585,7 @@ LinkerScript::computeInputSections(const InputSectionDescription *cmd, // ret[sizeBeforeCurrPat,ret.size()) are already in the input order, so we // just sort by sortOuter and sortInner. sortInputSections( + ctx, MutableArrayRef(ret).slice(sizeBeforeCurrPat), pat.sortOuter, pat.sortInner); sizeAfterPrevSort = ret.size(); @@ -865,7 +867,8 @@ static OutputDesc *createSection(InputSectionBase *isec, StringRef outsecName) { return osd; } -static OutputDesc *addInputSec(StringMap> &map, +static OutputDesc *addInputSec(Ctx &ctx, + StringMap> &map, InputSectionBase *isec, StringRef outsecName) { // Sections with SHT_GROUP or SHF_GROUP attributes reach here only when the -r // option is given. 
A section with SHT_GROUP defines a "section group", and @@ -983,7 +986,7 @@ void LinkerScript::addOrphanSections() { } else if (OutputSection *sec = findByName(sectionCommands, name)) { sec->recordSection(s); } else { - if (OutputDesc *osd = addInputSec(map, s, name)) + if (OutputDesc *osd = addInputSec(ctx, map, s, name)) v.push_back(osd); assert(isa(s) || s->getOutputSection()->sectionIndex == UINT32_MAX); @@ -1114,7 +1117,7 @@ LinkerScript::findMemoryRegion(OutputSection *sec, MemoryRegion *hint) { return {nullptr, nullptr}; } -static OutputSection *findFirstSection(PhdrEntry *load) { +static OutputSection *findFirstSection(Ctx &ctx, PhdrEntry *load) { for (OutputSection *sec : ctx.outputSections) if (sec->ptLoad == load) return sec; @@ -1187,7 +1190,7 @@ bool LinkerScript::assignOffsets(OutputSection *sec) { // Propagate state->lmaOffset to the first "non-header" section. if (PhdrEntry *l = sec->ptLoad) - if (sec == findFirstSection(l)) + if (sec == findFirstSection(ctx, l)) l->lmaOffset = state->lmaOffset; // We can call this method multiple times during the creation of @@ -1462,7 +1465,7 @@ void LinkerScript::allocateHeaders(SmallVector &phdrs) { ctx.out.elfHeader->ptLoad = nullptr; ctx.out.programHeaders->ptLoad = nullptr; - firstPTLoad->firstSec = findFirstSection(firstPTLoad); + firstPTLoad->firstSec = findFirstSection(ctx, firstPTLoad); llvm::erase_if(phdrs, [](const PhdrEntry *e) { return e->p_type == PT_PHDR; }); From 15de239406bfc0a1dfbd0640490c4bd5d1e0ac33 Mon Sep 17 00:00:00 2001 From: Serge Pavlov Date: Fri, 11 Oct 2024 12:09:10 +0700 Subject: [PATCH 125/177] [IR] Allow MDString in operand bundles (#110805) This change implements support of metadata strings in operand bundle values. It makes possible calls like: call void @some_func(i32 %x) [ "foo"(i32 42, metadata !"abc") ] It requires some extension of the bitcode serialization. 
As SSA values and metadata are stored in different tables, there must be a way to distinguish them during deserialization. It is implemented by putting a special marker before the metadata index. The marker cannot be treated as a reference to any SSA value, so it unambiguously identifies metadata. It allows extending the bitcode serialization without breaking compatibility. Metadata as operand bundle values are intended to be used in floating-point function calls. They would represent the same information as now is passed by the constrained intrinsic arguments. --- llvm/docs/LangRef.rst | 6 +++--- llvm/docs/ReleaseNotes.md | 2 ++ llvm/include/llvm/Bitcode/LLVMBitCodes.h | 3 +++ llvm/lib/AsmParser/LLParser.cpp | 8 +++++++- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 21 ++++++++++++++++++-- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 17 +++++++++++++++- llvm/test/Bitcode/compatibility.ll | 8 ++++++++ llvm/test/Bitcode/operand-bundles.ll | 24 +++++++++++++++++++++++ 8 files changed, 82 insertions(+), 7 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 0c7279de06cd68..a330b804930326 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2666,8 +2666,8 @@ are grouped into a single :ref:`attribute group `. Operand Bundles --------------- -Operand bundles are tagged sets of SSA values that can be associated -with certain LLVM instructions (currently only ``call`` s and +Operand bundles are tagged sets of SSA values or metadata strings that can be +associated with certain LLVM instructions (currently only ``call`` s and ``invoke`` s). In a way they are like metadata, but dropping them is incorrect and will change program semantics. 
@@ -2675,7 +2675,7 @@ Syntax:: operand bundle set ::= '[' operand bundle (, operand bundle )* ']' operand bundle ::= tag '(' [ bundle operand ] (, bundle operand )* ')' - bundle operand ::= SSA value + bundle operand ::= SSA value | metadata string tag ::= string constant Operand bundles are **not** part of a function's signature, and a diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 8ac5900a7e532e..dcdd7a25c7fbee 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -88,6 +88,8 @@ Changes to the LLVM IR * `llvm.nvvm.ptr.shared.to.gen` * `llvm.nvvm.ptr.constant.to.gen` * `llvm.nvvm.ptr.local.to.gen` + +* Operand bundle values can now be metadata strings. Changes to LLVM infrastructure ------------------------------ diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index cbd92fd52fc75a..ba2efee9414218 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -529,6 +529,9 @@ enum PossiblyExactOperatorOptionalFlags { PEO_EXACT = 0 }; /// PossiblyDisjointInst's SubclassOptionalData contents. enum PossiblyDisjointInstOptionalFlags { PDI_DISJOINT = 0 }; +/// Mark to distinguish metadata from value in an operator bundle. +enum MetadataOperandBundleValueMarker { OB_METADATA = 0x80000000 }; + /// GetElementPtrOptionalFlags - Flags for serializing /// GEPOperator's SubclassOptionalData contents. 
enum GetElementPtrOptionalFlags { diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 9f2ef2e6a9311e..c3b4a8235ce637 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -3202,8 +3202,14 @@ bool LLParser::parseOptionalOperandBundles( Type *Ty = nullptr; Value *Input = nullptr; - if (parseType(Ty) || parseValue(Ty, Input, PFS)) + if (parseType(Ty)) return true; + if (Ty->isMetadataTy()) { + if (parseMetadataAsValue(Input, PFS)) + return true; + } else if (parseValue(Ty, Input, PFS)) { + return true; + } Inputs.push_back(Input); } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 6f997510b03609..8ee93253bc2447 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -792,6 +792,24 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { return ResVal == nullptr; } + bool getValueOrMetadata(const SmallVectorImpl &Record, + unsigned &Slot, unsigned InstNum, Value *&ResVal, + BasicBlock *ConstExprInsertBB) { + if (Slot == Record.size()) + return true; + unsigned ValID = Record[Slot++]; + if (ValID != bitc::OB_METADATA) { + unsigned TypeId; + return getValueTypePair(Record, --Slot, InstNum, ResVal, TypeId, + ConstExprInsertBB); + } + if (Slot == Record.size()) + return true; + unsigned ValNo = InstNum - (unsigned)Record[Slot++]; + ResVal = MetadataAsValue::get(Context, getFnMetadataByID(ValNo)); + return false; + } + /// Read a value out of the specified record from slot 'Slot'. Increment Slot /// past the number of slots used by the value in the record. Return true if /// there is an error. 
@@ -6767,8 +6785,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) { unsigned OpNum = 1; while (OpNum != Record.size()) { Value *Op; - unsigned OpTypeID; - if (getValueTypePair(Record, OpNum, NextValueNo, Op, OpTypeID, CurBB)) + if (getValueOrMetadata(Record, OpNum, NextValueNo, Op, CurBB)) return error("Invalid record"); Inputs.push_back(Op); } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index d9086bfebbd2a9..bec0caef58afa8 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -395,6 +395,8 @@ class ModuleBitcodeWriter : public ModuleBitcodeWriterBase { void writeModuleConstants(); bool pushValueAndType(const Value *V, unsigned InstID, SmallVectorImpl &Vals); + bool pushValueOrMetadata(const Value *V, unsigned InstID, + SmallVectorImpl &Vals); void writeOperandBundles(const CallBase &CB, unsigned InstID); void pushValue(const Value *V, unsigned InstID, SmallVectorImpl &Vals); @@ -2931,6 +2933,19 @@ bool ModuleBitcodeWriter::pushValueAndType(const Value *V, unsigned InstID, return false; } +bool ModuleBitcodeWriter::pushValueOrMetadata(const Value *V, unsigned InstID, + SmallVectorImpl &Vals) { + bool IsMetadata = V->getType()->isMetadataTy(); + if (IsMetadata) { + Vals.push_back(bitc::OB_METADATA); + Metadata *MD = cast(V)->getMetadata(); + unsigned ValID = VE.getMetadataID(MD); + Vals.push_back(InstID - ValID); + return false; + } + return pushValueAndType(V, InstID, Vals); +} + void ModuleBitcodeWriter::writeOperandBundles(const CallBase &CS, unsigned InstID) { SmallVector Record; @@ -2941,7 +2956,7 @@ void ModuleBitcodeWriter::writeOperandBundles(const CallBase &CS, Record.push_back(C.getOperandBundleTagID(Bundle.getTagName())); for (auto &Input : Bundle.Inputs) - pushValueAndType(Input, InstID, Record); + pushValueOrMetadata(Input, InstID, Record); Stream.EmitRecord(bitc::FUNC_CODE_OPERAND_BUNDLE, Record); Record.clear(); diff --git 
a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index a1b2370a87b821..280c3a99d7535f 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -1327,6 +1327,14 @@ continue: ret i32 0 } +declare void @instructions.bundles.callee(i32) +define void @instructions.bundles.metadata(i32 %x) { +entry: + call void @instructions.bundles.callee(i32 %x) [ "foo"(i32 42, metadata !"abc"), "bar"(metadata !"abcde", metadata !"qwerty") ] +; CHECK: call void @instructions.bundles.callee(i32 %x) [ "foo"(i32 42, metadata !"abc"), "bar"(metadata !"abcde", metadata !"qwerty") ] + ret void +} + ; Instructions -- Unary Operations define void @instructions.unops(double %op1) { fneg double %op1 diff --git a/llvm/test/Bitcode/operand-bundles.ll b/llvm/test/Bitcode/operand-bundles.ll index ab28cffd84aa29..a8e086f784c6cf 100644 --- a/llvm/test/Bitcode/operand-bundles.ll +++ b/llvm/test/Bitcode/operand-bundles.ll @@ -56,6 +56,13 @@ define void @f4(i32* %ptr) { ret void } +define void @f5(i32 %x) { +entry: + call void @callee1(i32 10, i32 %x) [ "foo"(i32 42, metadata !"abc"), "bar"(metadata !"abcde", metadata !"qwerty") ] +; CHECK: call void @callee1(i32 10, i32 %x) [ "foo"(i32 42, metadata !"abc"), "bar"(metadata !"abcde", metadata !"qwerty") ] + ret void +} + ; Invoke versions of the above tests: @@ -150,3 +157,20 @@ exception: normal: ret void } + +define void @g5(ptr %ptr) personality i8 3 { +entry: + %l = load i32, ptr %ptr, align 4 + %x = add i32 42, 1 + invoke void @callee1(i32 10, i32 %x) [ "foo"(i32 42, metadata !"abc"), "bar"(metadata !"abcde", metadata !"qwerty") ] + to label %normal unwind label %exception +; CHECK: invoke void @callee1(i32 10, i32 %x) [ "foo"(i32 42, metadata !"abc"), "bar"(metadata !"abcde", metadata !"qwerty") ] + +exception: ; preds = %entry + %cleanup = landingpad i8 + cleanup + br label %normal + +normal: ; preds = %exception, %entry + ret void +} From c22588c7cdc5a82afd825ce90f21f922dedee98b Mon Sep 
17 00:00:00 2001 From: Fangrui Song Date: Thu, 10 Oct 2024 22:15:10 -0700 Subject: [PATCH 126/177] [ELF] Move InputSectionBase::file to SectionBase ... and add getCtx (file->ctx). This allows InputSectionBase and OutputSection to access ctx without taking an extra function argument. --- lld/ELF/EhFrame.cpp | 2 +- lld/ELF/InputFiles.h | 4 +++- lld/ELF/InputSection.cpp | 10 ++++++---- lld/ELF/InputSection.h | 22 ++++++++++++---------- lld/ELF/OutputSections.cpp | 5 +++-- lld/ELF/OutputSections.h | 3 ++- 6 files changed, 27 insertions(+), 19 deletions(-) diff --git a/lld/ELF/EhFrame.cpp b/lld/ELF/EhFrame.cpp index d2d0e62e97ec45..f4c788fe610ae6 100644 --- a/lld/ELF/EhFrame.cpp +++ b/lld/ELF/EhFrame.cpp @@ -119,7 +119,7 @@ void EhReader::skipAugP() { uint8_t enc = readByte(); if ((enc & 0xf0) == DW_EH_PE_aligned) failOn(d.data() - 1, "DW_EH_PE_aligned encoding is not supported"); - size_t size = getAugPSize(ctx, enc); + size_t size = getAugPSize(isec->getCtx(), enc); if (size == 0) failOn(d.data() - 1, "unknown FDE encoding"); if (size >= d.size()) diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h index 0b54f92d1a2669..f80413b215047d 100644 --- a/lld/ELF/InputFiles.h +++ b/lld/ELF/InputFiles.h @@ -48,8 +48,10 @@ void parseFiles(Ctx &, const std::vector &files); // The root class of input files. 
class InputFile { -protected: +public: Ctx &ctx; + +protected: std::unique_ptr symbols; uint32_t numSymbols = 0; SmallVector sections; diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 0885815a22a14a..90716f4f3675cc 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -52,9 +52,9 @@ InputSectionBase::InputSectionBase(InputFile *file, uint64_t flags, uint32_t link, uint32_t info, uint32_t addralign, ArrayRef data, StringRef name, Kind sectionKind) - : SectionBase(sectionKind, name, flags, entsize, addralign, type, info, - link), - file(file), content_(data.data()), size(data.size()) { + : SectionBase(sectionKind, file, name, flags, entsize, addralign, type, + info, link), + content_(data.data()), size(data.size()) { // In order to reduce memory allocation, we assume that mergeable // sections are smaller than 4 GiB, which is not an unreasonable // assumption as of 2017. @@ -88,7 +88,7 @@ template InputSectionBase::InputSectionBase(ObjFile &file, const typename ELFT::Shdr &hdr, StringRef name, Kind sectionKind) - : InputSectionBase(&file, getFlags(ctx, hdr.sh_flags), hdr.sh_type, + : InputSectionBase(&file, getFlags(file.ctx, hdr.sh_flags), hdr.sh_type, hdr.sh_entsize, hdr.sh_link, hdr.sh_info, hdr.sh_addralign, getSectionContents(file, hdr), name, sectionKind) { @@ -185,6 +185,8 @@ RelsOrRelas InputSectionBase::relsOrRelas(bool supportsCrel) const { return ret; } +Ctx &SectionBase::getCtx() const { return file->ctx; } + uint64_t SectionBase::getOffset(uint64_t offset) const { switch (kind()) { case Output: { diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index bff9ec324d9bc5..8f69a957e11d7a 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -78,6 +78,12 @@ class SectionBase { uint8_t partition = 1; uint32_t type; + + // The file which contains this section. For InputSectionBase, its dynamic + // type is usually ObjFile, but may be an InputFile of InternalKind + // (for a synthetic section). 
+ InputFile *file; + StringRef name; // The 1-indexed partition that this section is assigned to by the garbage @@ -92,6 +98,7 @@ class SectionBase { uint32_t link; uint32_t info; + Ctx &getCtx() const; OutputSection *getOutputSection(); const OutputSection *getOutputSection() const { return const_cast(this)->getOutputSection(); @@ -108,12 +115,12 @@ class SectionBase { void markDead() { partition = 0; } protected: - constexpr SectionBase(Kind sectionKind, StringRef name, uint64_t flags, - uint32_t entsize, uint32_t addralign, uint32_t type, - uint32_t info, uint32_t link) + constexpr SectionBase(Kind sectionKind, InputFile *file, StringRef name, + uint64_t flags, uint32_t entsize, uint32_t addralign, + uint32_t type, uint32_t info, uint32_t link) : sectionKind(sectionKind), bss(false), keepUnique(false), type(type), - name(name), flags(flags), addralign(addralign), entsize(entsize), - link(link), info(info) {} + file(file), name(name), flags(flags), addralign(addralign), + entsize(entsize), link(link), info(info) {} }; struct SymbolAnchor { @@ -150,11 +157,6 @@ class InputSectionBase : public SectionBase { return s->kind() != Output && s->kind() != Class; } - // The file which contains this section. Its dynamic type is usually - // ObjFile, but may be an InputFile of InternalKind (for a synthetic - // section). - InputFile *file; - // Input sections are part of an output section. Special sections // like .eh_frame and merge sections are first combined into a // synthetic section that is then added to an output section. 
In all diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp index 408dbdc43d5481..864c30ca790508 100644 --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -66,8 +66,9 @@ void OutputSection::writeHeaderTo(typename ELFT::Shdr *shdr) { } OutputSection::OutputSection(StringRef name, uint32_t type, uint64_t flags) - : SectionBase(Output, name, flags, /*Entsize*/ 0, /*Alignment*/ 1, type, - /*Info*/ 0, /*Link*/ 0) {} + : SectionBase(Output, ctx.internalFile, name, flags, /*entsize=*/0, + /*addralign=*/1, type, + /*info=*/0, /*link=*/0) {} // We allow sections of types listed below to merged into a // single progbits section. This is typically done by linker diff --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h index 904206b20bc1cb..11977507e9268e 100644 --- a/lld/ELF/OutputSections.h +++ b/lld/ELF/OutputSections.h @@ -150,7 +150,8 @@ struct SectionClass final : public SectionBase { SmallVector commands; bool assigned = false; - SectionClass(StringRef name) : SectionBase(Class, name, 0, 0, 0, 0, 0, 0) {} + SectionClass(StringRef name) + : SectionBase(Class, nullptr, name, 0, 0, 0, 0, 0, 0) {} static bool classof(const SectionBase *s) { return s->kind() == Class; } }; From e018f550d0c40bd99294cdd943c23bbec3804ace Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 10 Oct 2024 22:22:25 -0700 Subject: [PATCH 127/177] [ELF] Pass Ctx & --- lld/ELF/ARMErrataFix.cpp | 2 +- lld/ELF/Arch/Mips.cpp | 4 ++-- lld/ELF/DWARF.cpp | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lld/ELF/ARMErrataFix.cpp b/lld/ELF/ARMErrataFix.cpp index 839ab2b074bdd1..5818772bf19d14 100644 --- a/lld/ELF/ARMErrataFix.cpp +++ b/lld/ELF/ARMErrataFix.cpp @@ -417,7 +417,7 @@ void ARMErr657417Patcher::insertPatches( // isec so the branch we are patching always goes forwards. 
static void implementPatch(ScanResult sr, InputSection *isec, std::vector &patches) { - + Ctx &ctx = isec->getCtx(); log("detected cortex-a8-657419 erratum sequence starting at " + utohexstr(isec->getVA(sr.off)) + " in unpatched output."); Patch657417Section *psec; diff --git a/lld/ELF/Arch/Mips.cpp b/lld/ELF/Arch/Mips.cpp index 975fa9ead762d7..6313ac8ca4fb9a 100644 --- a/lld/ELF/Arch/Mips.cpp +++ b/lld/ELF/Arch/Mips.cpp @@ -480,7 +480,7 @@ int64_t MIPS::getImplicitAddend(const uint8_t *buf, RelType type) const { } static std::pair -calculateMipsRelChain(uint8_t *loc, RelType type, uint64_t val) { +calculateMipsRelChain(Ctx &ctx, uint8_t *loc, RelType type, uint64_t val) { // MIPS N64 ABI packs multiple relocations into the single relocation // record. In general, all up to three relocations can have arbitrary // types. In fact, Clang and GCC uses only a few combinations. For now, @@ -572,7 +572,7 @@ void MIPS::relocate(uint8_t *loc, const Relocation &rel, RelType type = rel.type; if (ELFT::Is64Bits || ctx.arg.mipsN32Abi) - std::tie(type, val) = calculateMipsRelChain(loc, type, val); + std::tie(type, val) = calculateMipsRelChain(ctx, loc, type, val); // Detect cross-mode jump/branch and fix instruction. 
val = fixupCrossModeJump(loc, type, val); diff --git a/lld/ELF/DWARF.cpp b/lld/ELF/DWARF.cpp index 133e66baabe2de..8e4740919a481d 100644 --- a/lld/ELF/DWARF.cpp +++ b/lld/ELF/DWARF.cpp @@ -112,6 +112,7 @@ LLDDwarfObj::findAux(const InputSectionBase &sec, uint64_t pos, const RelTy &rel = *it; const ObjFile *file = sec.getFile(); + Ctx &ctx = sec.getCtx(); uint32_t symIndex = rel.getSymbol(ctx.arg.isMips64EL); const typename ELFT::Sym &sym = file->template getELFSyms()[symIndex]; uint32_t secIndex = file->getSectionIndex(sym); From 25cda9e069bc5948f38dde0d2e07814a7bf3fc71 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 10 Oct 2024 23:07:02 -0700 Subject: [PATCH 128/177] [ELF] Pass Ctx & to SyntheticSection --- lld/ELF/AArch64ErrataFix.cpp | 2 +- lld/ELF/ARMErrataFix.cpp | 2 +- lld/ELF/Arch/ARM.cpp | 9 +- lld/ELF/Arch/RISCV.cpp | 7 +- lld/ELF/Driver.cpp | 2 +- lld/ELF/InputSection.h | 2 +- lld/ELF/OutputSections.cpp | 4 +- lld/ELF/Relocations.cpp | 6 +- lld/ELF/SyntheticSections.cpp | 305 ++++++++++++++++++---------------- lld/ELF/SyntheticSections.h | 121 +++++++------- 10 files changed, 240 insertions(+), 220 deletions(-) diff --git a/lld/ELF/AArch64ErrataFix.cpp b/lld/ELF/AArch64ErrataFix.cpp index 19db4295d46ed5..a5129c58da13d9 100644 --- a/lld/ELF/AArch64ErrataFix.cpp +++ b/lld/ELF/AArch64ErrataFix.cpp @@ -393,7 +393,7 @@ class elf::Patch843419Section final : public SyntheticSection { }; Patch843419Section::Patch843419Section(Ctx &ctx, InputSection *p, uint64_t off) - : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 4, + : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 4, ".text.patch"), patchee(p), patcheeOffset(off) { this->parent = p->getParent(); diff --git a/lld/ELF/ARMErrataFix.cpp b/lld/ELF/ARMErrataFix.cpp index 5818772bf19d14..57df542e57ec48 100644 --- a/lld/ELF/ARMErrataFix.cpp +++ b/lld/ELF/ARMErrataFix.cpp @@ -136,7 +136,7 @@ static bool is32bitBranch(uint32_t instr) { Patch657417Section::Patch657417Section(Ctx &ctx, 
InputSection *p, uint64_t off, uint32_t instr, bool isARM) - : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 4, + : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 4, ".text.patch"), patchee(p), patcheeOffset(off), instr(instr), isARM(isARM) { parent = p->getParent(); diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 013e90cde6f995..ecf293602ac99d 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -1331,7 +1331,7 @@ class elf::ArmCmseSGVeneer { }; ArmCmseSGSection::ArmCmseSGSection(Ctx &ctx) - : SyntheticSection(llvm::ELF::SHF_ALLOC | llvm::ELF::SHF_EXECINSTR, + : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC | llvm::ELF::SHF_EXECINSTR, llvm::ELF::SHT_PROGBITS, /*alignment=*/32, ".gnu.sgstubs"), ctx(ctx) { @@ -1446,10 +1446,11 @@ void ArmCmseSGSection::finalizeContents(Ctx &) { // https://developer.arm.com/documentation/ecm0359818/latest template void elf::writeARMCmseImportLib(Ctx &ctx) { StringTableSection *shstrtab = - make(".shstrtab", /*dynamic=*/false); + make(ctx, ".shstrtab", /*dynamic=*/false); StringTableSection *strtab = - make(".strtab", /*dynamic=*/false); - SymbolTableBaseSection *impSymTab = make>(*strtab); + make(ctx, ".strtab", /*dynamic=*/false); + SymbolTableBaseSection *impSymTab = + make>(ctx, *strtab); SmallVector, 0> osIsPairs; osIsPairs.emplace_back(make(strtab->name, 0, 0), strtab); diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 57cc26b3f0a3ff..351cca025b3864 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -1044,8 +1044,9 @@ namespace { // extension. 
class RISCVAttributesSection final : public SyntheticSection { public: - RISCVAttributesSection() - : SyntheticSection(0, SHT_RISCV_ATTRIBUTES, 1, ".riscv.attributes") {} + RISCVAttributesSection(Ctx &ctx) + : SyntheticSection(ctx, 0, SHT_RISCV_ATTRIBUTES, 1, ".riscv.attributes") { + } size_t getSize(Ctx &) const override { return size; } void writeTo(Ctx &, uint8_t *buf) override; @@ -1179,7 +1180,7 @@ mergeAttributesSection(Ctx &ctx, unsigned firstStackAlignValue = 0, xlen = 0; bool hasArch = false; - ctx.in.riscvAttributes = std::make_unique(); + ctx.in.riscvAttributes = std::make_unique(ctx); auto &merged = static_cast(*ctx.in.riscvAttributes); // Collect all tags values from attributes section. diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 43f75bab12775f..019388c9bd2e2c 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -2370,7 +2370,7 @@ static void replaceCommonSymbols(Ctx &ctx) { if (!s) continue; - auto *bss = make("COMMON", s->size, s->alignment); + auto *bss = make(ctx, "COMMON", s->size, s->alignment); bss->file = s->file; ctx.inputSections.push_back(bss); Defined(s->file, StringRef(), s->binding, s->stOther, s->type, diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index 8f69a957e11d7a..543ff4db3c3270 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -472,7 +472,7 @@ static_assert(sizeof(InputSection) <= 160, "InputSection is too big"); class SyntheticSection : public InputSection { public: - SyntheticSection(uint64_t flags, uint32_t type, uint32_t addralign, + SyntheticSection(Ctx &ctx, uint64_t flags, uint32_t type, uint32_t addralign, StringRef name) : InputSection(ctx.internalFile, flags, type, addralign, {}, name, InputSectionBase::Synthetic) {} diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp index 864c30ca790508..3f3b80830d80d5 100644 --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -178,8 +178,8 @@ static MergeSyntheticSection *createMergeSynthetic(Ctx 
&ctx, StringRef name, uint64_t flags, uint32_t addralign) { if ((flags & SHF_STRINGS) && ctx.arg.optimize >= 2) - return make(name, type, flags, addralign); - return make(name, type, flags, addralign); + return make(ctx, name, type, flags, addralign); + return make(ctx, name, type, flags, addralign); } // This function scans over the InputSectionBase list sectionBases to create diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index ba2d493c28213f..5d81d0cccb78e5 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -381,8 +381,8 @@ template static void addCopyRelSymbol(Ctx &ctx, SharedSymbol &ss) { // See if this symbol is in a read-only segment. If so, preserve the symbol's // memory protection by reserving space in the .bss.rel.ro section. bool isRO = isReadOnly(ss); - BssSection *sec = - make(isRO ? ".bss.rel.ro" : ".bss", symSize, ss.alignment); + BssSection *sec = make(ctx, isRO ? ".bss.rel.ro" : ".bss", + symSize, ss.alignment); OutputSection *osec = (isRO ? ctx.in.bssRelRo : ctx.in.bss)->getParent(); // At this point, sectionBases has been migrated to sections. Append sec to @@ -2185,7 +2185,7 @@ void ThunkCreator::createInitialThunkSections( ThunkSection *ThunkCreator::addThunkSection(OutputSection *os, InputSectionDescription *isd, uint64_t off) { - auto *ts = make(os, off); + auto *ts = make(ctx, os, off); ts->partition = os->partition; if ((ctx.arg.fixCortexA53Errata843419 || ctx.arg.fixCortexA8) && !isd->sections.empty()) { diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 88f0ccf1c4b730..a65c137762ce63 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -96,8 +96,9 @@ MergeInputSection *elf::createCommentSection() { // .MIPS.abiflags section. 
template -MipsAbiFlagsSection::MipsAbiFlagsSection(Elf_Mips_ABIFlags flags) - : SyntheticSection(SHF_ALLOC, SHT_MIPS_ABIFLAGS, 8, ".MIPS.abiflags"), +MipsAbiFlagsSection::MipsAbiFlagsSection(Ctx &ctx, + Elf_Mips_ABIFlags flags) + : SyntheticSection(ctx, SHF_ALLOC, SHT_MIPS_ABIFLAGS, 8, ".MIPS.abiflags"), flags(flags) { this->entsize = sizeof(Elf_Mips_ABIFlags); } @@ -152,14 +153,14 @@ MipsAbiFlagsSection::create(Ctx &ctx) { }; if (create) - return std::make_unique>(flags); + return std::make_unique>(ctx, flags); return nullptr; } // .MIPS.options section. template -MipsOptionsSection::MipsOptionsSection(Elf_Mips_RegInfo reginfo) - : SyntheticSection(SHF_ALLOC, SHT_MIPS_OPTIONS, 8, ".MIPS.options"), +MipsOptionsSection::MipsOptionsSection(Ctx &ctx, Elf_Mips_RegInfo reginfo) + : SyntheticSection(ctx, SHF_ALLOC, SHT_MIPS_OPTIONS, 8, ".MIPS.options"), reginfo(reginfo) { this->entsize = sizeof(Elf_Mips_Options) + sizeof(Elf_Mips_RegInfo); } @@ -216,13 +217,13 @@ MipsOptionsSection::create(Ctx &ctx) { } }; - return std::make_unique>(reginfo); + return std::make_unique>(ctx, reginfo); } // MIPS .reginfo section. template -MipsReginfoSection::MipsReginfoSection(Elf_Mips_RegInfo reginfo) - : SyntheticSection(SHF_ALLOC, SHT_MIPS_REGINFO, 4, ".reginfo"), +MipsReginfoSection::MipsReginfoSection(Ctx &ctx, Elf_Mips_RegInfo reginfo) + : SyntheticSection(ctx, SHF_ALLOC, SHT_MIPS_REGINFO, 4, ".reginfo"), reginfo(reginfo) { this->entsize = sizeof(Elf_Mips_RegInfo); } @@ -263,7 +264,7 @@ MipsReginfoSection::create(Ctx &ctx) { sec->getFile()->mipsGp0 = r->ri_gp_value; }; - return std::make_unique>(reginfo); + return std::make_unique>(ctx, reginfo); } InputSection *elf::createInterpSection() { @@ -319,8 +320,8 @@ static size_t getHashSize() { // If the flag is zero (which indicates that the intersection of the feature // sets is empty, or some input files didn't have .note.gnu.property sections), // we don't create this section. 
-GnuPropertySection::GnuPropertySection() - : SyntheticSection(llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, +GnuPropertySection::GnuPropertySection(Ctx &ctx) + : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, ctx.arg.wordsize, ".note.gnu.property") {} void GnuPropertySection::writeTo(Ctx &ctx, uint8_t *buf) { @@ -361,8 +362,8 @@ size_t GnuPropertySection::getSize(Ctx &ctx) const { return contentSize + 16; } -BuildIdSection::BuildIdSection() - : SyntheticSection(SHF_ALLOC, SHT_NOTE, 4, ".note.gnu.build-id"), +BuildIdSection::BuildIdSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_NOTE, 4, ".note.gnu.build-id"), hashSize(getHashSize()) {} void BuildIdSection::writeTo(Ctx &ctx, uint8_t *buf) { @@ -378,14 +379,16 @@ void BuildIdSection::writeBuildId(ArrayRef buf) { memcpy(hashBuf, buf.data(), hashSize); } -BssSection::BssSection(StringRef name, uint64_t size, uint32_t alignment) - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_NOBITS, alignment, name) { +BssSection::BssSection(Ctx &ctx, StringRef name, uint64_t size, + uint32_t alignment) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_NOBITS, alignment, + name) { this->bss = true; this->size = size; } -EhFrameSection::EhFrameSection() - : SyntheticSection(SHF_ALLOC, SHT_PROGBITS, 1, ".eh_frame") {} +EhFrameSection::EhFrameSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_PROGBITS, 1, ".eh_frame") {} // Search for an existing CIE record or create a new one. 
// CIE records from input object files are uniquified by their contents @@ -653,8 +656,8 @@ void EhFrameSection::writeTo(Ctx &ctx, uint8_t *buf) { getPartition().ehFrameHdr->write(); } -GotSection::GotSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, +GotSection::GotSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, ctx.target->gotEntrySize, ".got") { numEntries = ctx.target->gotHeaderEntriesNum; } @@ -737,9 +740,9 @@ static uint64_t getMipsPageCount(uint64_t size) { return (size + 0xfffe) / 0xffff + 1; } -MipsGotSection::MipsGotSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE | SHF_MIPS_GPREL, SHT_PROGBITS, 16, - ".got") {} +MipsGotSection::MipsGotSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE | SHF_MIPS_GPREL, + SHT_PROGBITS, 16, ".got") {} void MipsGotSection::addEntry(InputFile &file, Symbol &sym, int64_t addend, RelExpr expr) { @@ -1169,9 +1172,9 @@ void MipsGotSection::writeTo(Ctx &ctx, uint8_t *buf) { // instead of the .got.plt, and the type is SHT_NOBITS similar to a .bss // section. I don't know why we have a BSS style type for the section but it is // consistent across both 64-bit PowerPC ABIs as well as the 32-bit PowerPC ABI. -GotPltSection::GotPltSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, ctx.arg.wordsize, - ".got.plt") { +GotPltSection::GotPltSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, + ctx.arg.wordsize, ".got.plt") { if (ctx.arg.emachine == EM_PPC) { name = ".plt"; } else if (ctx.arg.emachine == EM_PPC64) { @@ -1221,8 +1224,8 @@ static StringRef getIgotPltName() { // On PowerPC64 the GotPltSection type is SHT_NOBITS so we have to follow suit // with the IgotPltSection. -IgotPltSection::IgotPltSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, +IgotPltSection::IgotPltSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, ctx.arg.emachine == EM_PPC64 ? 
SHT_NOBITS : SHT_PROGBITS, ctx.target->gotEntrySize, getIgotPltName()) {} @@ -1242,8 +1245,9 @@ void IgotPltSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -StringTableSection::StringTableSection(StringRef name, bool dynamic) - : SyntheticSection(dynamic ? (uint64_t)SHF_ALLOC : 0, SHT_STRTAB, 1, name), +StringTableSection::StringTableSection(Ctx &ctx, StringRef name, bool dynamic) + : SyntheticSection(ctx, dynamic ? (uint64_t)SHF_ALLOC : 0, SHT_STRTAB, 1, + name), dynamic(dynamic) { // ELF string tables start with a NUL byte. strings.push_back(""); @@ -1283,9 +1287,9 @@ void StringTableSection::writeTo(Ctx &ctx, uint8_t *buf) { static unsigned getVerDefNum() { return namedVersionDefs(ctx).size() + 1; } template -DynamicSection::DynamicSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_DYNAMIC, ctx.arg.wordsize, - ".dynamic") { +DynamicSection::DynamicSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_DYNAMIC, + ctx.arg.wordsize, ".dynamic") { this->entsize = ELFT::Is64Bits ? 16 : 8; // .dynamic section is not writable on MIPS and on Fuchsia OS @@ -1632,12 +1636,12 @@ uint32_t DynamicReloc::getSymIndex(SymbolTableBaseSection *symTab) const { return index; } -RelocationBaseSection::RelocationBaseSection(StringRef name, uint32_t type, - int32_t dynamicTag, +RelocationBaseSection::RelocationBaseSection(Ctx &ctx, StringRef name, + uint32_t type, int32_t dynamicTag, int32_t sizeDynamicTag, bool combreloc, unsigned concurrency) - : SyntheticSection(SHF_ALLOC, type, ctx.arg.wordsize, name), + : SyntheticSection(ctx, SHF_ALLOC, type, ctx.arg.wordsize, name), dynamicTag(dynamicTag), sizeDynamicTag(sizeDynamicTag), relocsVec(concurrency), combreloc(combreloc) {} @@ -1728,9 +1732,9 @@ void RelocationBaseSection::computeRels() { } template -RelocationSection::RelocationSection(StringRef name, bool combreloc, - unsigned concurrency) - : RelocationBaseSection(name, ctx.arg.isRela ? 
SHT_RELA : SHT_REL, +RelocationSection::RelocationSection(Ctx &ctx, StringRef name, + bool combreloc, unsigned concurrency) + : RelocationBaseSection(ctx, name, ctx.arg.isRela ? SHT_RELA : SHT_REL, ctx.arg.isRela ? DT_RELA : DT_REL, ctx.arg.isRela ? DT_RELASZ : DT_RELSZ, combreloc, concurrency) { @@ -1750,9 +1754,10 @@ void RelocationSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -RelrBaseSection::RelrBaseSection(unsigned concurrency, bool isAArch64Auth) +RelrBaseSection::RelrBaseSection(Ctx &ctx, unsigned concurrency, + bool isAArch64Auth) : SyntheticSection( - SHF_ALLOC, + ctx, SHF_ALLOC, isAArch64Auth ? SHT_AARCH64_AUTH_RELR : (ctx.arg.useAndroidRelrTags ? SHT_ANDROID_RELR : SHT_RELR), @@ -1771,9 +1776,9 @@ void RelrBaseSection::mergeRels() { template AndroidPackedRelocationSection::AndroidPackedRelocationSection( - StringRef name, unsigned concurrency) + Ctx &ctx, StringRef name, unsigned concurrency) : RelocationBaseSection( - name, ctx.arg.isRela ? SHT_ANDROID_RELA : SHT_ANDROID_REL, + ctx, name, ctx.arg.isRela ? SHT_ANDROID_RELA : SHT_ANDROID_REL, ctx.arg.isRela ? DT_ANDROID_RELA : DT_ANDROID_REL, ctx.arg.isRela ? DT_ANDROID_RELASZ : DT_ANDROID_RELSZ, /*combreloc=*/false, concurrency) { @@ -2024,8 +2029,9 @@ bool AndroidPackedRelocationSection::updateAllocSize(Ctx &ctx) { } template -RelrSection::RelrSection(unsigned concurrency, bool isAArch64Auth) - : RelrBaseSection(concurrency, isAArch64Auth) { +RelrSection::RelrSection(Ctx &ctx, unsigned concurrency, + bool isAArch64Auth) + : RelrBaseSection(ctx, concurrency, isAArch64Auth) { this->entsize = ctx.arg.wordsize; } @@ -2110,8 +2116,9 @@ template bool RelrSection::updateAllocSize(Ctx &ctx) { return relrRelocs.size() != oldSize; } -SymbolTableBaseSection::SymbolTableBaseSection(StringTableSection &strTabSec) - : SyntheticSection(strTabSec.isDynamic() ? 
(uint64_t)SHF_ALLOC : 0, +SymbolTableBaseSection::SymbolTableBaseSection(Ctx &ctx, + StringTableSection &strTabSec) + : SyntheticSection(ctx, strTabSec.isDynamic() ? (uint64_t)SHF_ALLOC : 0, strTabSec.isDynamic() ? SHT_DYNSYM : SHT_SYMTAB, ctx.arg.wordsize, strTabSec.isDynamic() ? ".dynsym" : ".symtab"), @@ -2226,8 +2233,9 @@ size_t SymbolTableBaseSection::getSymbolIndex(const Symbol &sym) { } template -SymbolTableSection::SymbolTableSection(StringTableSection &strTabSec) - : SymbolTableBaseSection(strTabSec) { +SymbolTableSection::SymbolTableSection(Ctx &ctx, + StringTableSection &strTabSec) + : SymbolTableBaseSection(ctx, strTabSec) { this->entsize = sizeof(Elf_Sym); } @@ -2327,8 +2335,8 @@ void SymbolTableSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -SymtabShndxSection::SymtabShndxSection() - : SyntheticSection(0, SHT_SYMTAB_SHNDX, 4, ".symtab_shndx") { +SymtabShndxSection::SymtabShndxSection(Ctx &ctx) + : SyntheticSection(ctx, 0, SHT_SYMTAB_SHNDX, 4, ".symtab_shndx") { this->entsize = 4; } @@ -2396,9 +2404,9 @@ size_t SymtabShndxSection::getSize(Ctx &ctx) const { // DSOs very quickly. If you are sure that your dynamic linker knows // about .gnu.hash, you want to specify --hash-style=gnu. Otherwise, a // safe bet is to specify --hash-style=both for backward compatibility. 
-GnuHashTableSection::GnuHashTableSection() - : SyntheticSection(SHF_ALLOC, SHT_GNU_HASH, ctx.arg.wordsize, ".gnu.hash") { -} +GnuHashTableSection::GnuHashTableSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_GNU_HASH, ctx.arg.wordsize, + ".gnu.hash") {} void GnuHashTableSection::finalizeContents(Ctx &) { if (OutputSection *sec = getPartition().dynSymTab->getParent()) @@ -2505,8 +2513,8 @@ void GnuHashTableSection::addSymbols(SmallVectorImpl &v) { v.push_back({ent.sym, ent.strTabOffset}); } -HashTableSection::HashTableSection() - : SyntheticSection(SHF_ALLOC, SHT_HASH, 4, ".hash") { +HashTableSection::HashTableSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_HASH, 4, ".hash") { this->entsize = 4; } @@ -2545,8 +2553,9 @@ void HashTableSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -PltSection::PltSection() - : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, ".plt"), +PltSection::PltSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, + ".plt"), headerSize(ctx.target->pltHeaderSize) { // On PowerPC, this section contains lazy symbol resolvers. if (ctx.arg.emachine == EM_PPC64) { @@ -2606,8 +2615,9 @@ void PltSection::addSymbols() { } } -IpltSection::IpltSection() - : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, ".iplt") { +IpltSection::IpltSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, + ".iplt") { if (ctx.arg.emachine == EM_PPC || ctx.arg.emachine == EM_PPC64) { name = ".glink"; addralign = 4; @@ -2641,7 +2651,7 @@ void IpltSection::addSymbols() { } } -PPC32GlinkSection::PPC32GlinkSection() { +PPC32GlinkSection::PPC32GlinkSection(Ctx &ctx) : PltSection(ctx) { name = ".glink"; addralign = 4; } @@ -2712,8 +2722,9 @@ size_t PPC32GlinkSection::getSize(Ctx &ctx) const { // // That said, the 2-PLT scheme is a part of the ABI, debuggers and other tools // depend on it, so we implement the ABI. 
-IBTPltSection::IBTPltSection() - : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, ".plt") {} +IBTPltSection::IBTPltSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, + ".plt") {} void IBTPltSection::writeTo(Ctx &ctx, uint8_t *buf) { ctx.target->writeIBTPlt(buf, ctx.in.plt->getNumEntries()); @@ -2728,9 +2739,9 @@ bool IBTPltSection::isNeeded(Ctx &ctx) const { return ctx.in.plt->getNumEntries() > 0; } -RelroPaddingSection::RelroPaddingSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_NOBITS, 1, ".relro_padding") { -} +RelroPaddingSection::RelroPaddingSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_NOBITS, 1, + ".relro_padding") {} // The string hash function for .gdb_index. static uint32_t computeGdbHash(StringRef s) { @@ -2742,8 +2753,8 @@ static uint32_t computeGdbHash(StringRef s) { // 4-byte alignment ensures that values in the hash lookup table and the name // table are aligned. -DebugNamesBaseSection::DebugNamesBaseSection() - : SyntheticSection(0, SHT_PROGBITS, 4, ".debug_names") {} +DebugNamesBaseSection::DebugNamesBaseSection(Ctx &ctx) + : SyntheticSection(ctx, 0, SHT_PROGBITS, 4, ".debug_names") {} // Get the size of the .debug_names section header in bytes for DWARF32: static uint32_t getDebugNamesHeaderSize(uint32_t augmentationStringSize) { @@ -3173,7 +3184,9 @@ void DebugNamesBaseSection::init( hdr.UnitLength = size - 4; } -template DebugNamesSection::DebugNamesSection() { +template +DebugNamesSection::DebugNamesSection(Ctx &ctx) + : DebugNamesBaseSection(ctx) { init([](InputFile *f, InputChunk &inputChunk, OutputChunk &chunk) { auto *file = cast>(f); DWARFContext dwarf(std::make_unique>(file)); @@ -3337,8 +3350,8 @@ void DebugNamesSection::writeTo(Ctx &ctx, uint8_t *buf) { assert(uint64_t(buf - beginBuf) == size); } -GdbIndexSection::GdbIndexSection() - : SyntheticSection(0, SHT_PROGBITS, 1, ".gdb_index") {} +GdbIndexSection::GdbIndexSection(Ctx &ctx) + : 
SyntheticSection(ctx, 0, SHT_PROGBITS, 1, ".gdb_index") {} // Returns the desired size of an on-disk hash table for a .gdb_index section. // There's a tradeoff between size and collision rate. We aim 75% utilization. @@ -3500,7 +3513,7 @@ createSymbols( // Returns a newly-created .gdb_index section. template -std::unique_ptr GdbIndexSection::create() { +std::unique_ptr GdbIndexSection::create(Ctx &) { llvm::TimeTraceScope timeScope("Create gdb index"); // Collect InputFiles with .debug_info. See the comment in @@ -3546,7 +3559,7 @@ std::unique_ptr GdbIndexSection::create() { nameAttrs[i] = readPubNamesAndTypes(dobj, chunks[i].compilationUnits); }); - auto ret = std::make_unique(); + auto ret = std::make_unique(ctx); ret->chunks = std::move(chunks); std::tie(ret->symbols, ret->size) = createSymbols(nameAttrs, ret->chunks); @@ -3630,8 +3643,8 @@ void GdbIndexSection::writeTo(Ctx &ctx, uint8_t *buf) { bool GdbIndexSection::isNeeded(Ctx &) const { return !chunks.empty(); } -EhFrameHeader::EhFrameHeader() - : SyntheticSection(SHF_ALLOC, SHT_PROGBITS, 4, ".eh_frame_hdr") {} +EhFrameHeader::EhFrameHeader(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_PROGBITS, 4, ".eh_frame_hdr") {} void EhFrameHeader::writeTo(Ctx &ctx, uint8_t *buf) { // Unlike most sections, the EhFrameHeader section is written while writing @@ -3675,8 +3688,8 @@ bool EhFrameHeader::isNeeded(Ctx &ctx) const { return isLive() && getPartition().ehFrame->isNeeded(ctx); } -VersionDefinitionSection::VersionDefinitionSection() - : SyntheticSection(SHF_ALLOC, SHT_GNU_verdef, sizeof(uint32_t), +VersionDefinitionSection::VersionDefinitionSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_GNU_verdef, sizeof(uint32_t), ".gnu.version_d") {} StringRef VersionDefinitionSection::getFileDefName() { @@ -3737,8 +3750,8 @@ size_t VersionDefinitionSection::getSize(Ctx &ctx) const { } // .gnu.version is a table where each entry is 2 byte long. 
-VersionTableSection::VersionTableSection() - : SyntheticSection(SHF_ALLOC, SHT_GNU_versym, sizeof(uint16_t), +VersionTableSection::VersionTableSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_GNU_versym, sizeof(uint16_t), ".gnu.version") { this->entsize = 2; } @@ -3788,8 +3801,8 @@ void elf::addVerneed(Symbol *ss) { } template -VersionNeedSection::VersionNeedSection() - : SyntheticSection(SHF_ALLOC, SHT_GNU_verneed, sizeof(uint32_t), +VersionNeedSection::VersionNeedSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_GNU_verneed, sizeof(uint32_t), ".gnu.version_r") {} template void VersionNeedSection::finalizeContents(Ctx &) { @@ -3872,9 +3885,9 @@ void MergeSyntheticSection::addSection(MergeInputSection *ms) { addralign = std::max(addralign, ms->addralign); } -MergeTailSection::MergeTailSection(StringRef name, uint32_t type, +MergeTailSection::MergeTailSection(Ctx &ctx, StringRef name, uint32_t type, uint64_t flags, uint32_t alignment) - : MergeSyntheticSection(name, type, flags, alignment), + : MergeSyntheticSection(ctx, name, type, flags, alignment), builder(StringTableBuilder::RAW, llvm::Align(alignment)) {} size_t MergeTailSection::getSize(Ctx &) const { return builder.getSize(); } @@ -3997,12 +4010,12 @@ void elf::combineEhSections(Ctx &ctx) { }); } -MipsRldMapSection::MipsRldMapSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, ctx.arg.wordsize, - ".rld_map") {} +MipsRldMapSection::MipsRldMapSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, + ctx.arg.wordsize, ".rld_map") {} -ARMExidxSyntheticSection::ARMExidxSyntheticSection() - : SyntheticSection(SHF_ALLOC | SHF_LINK_ORDER, SHT_ARM_EXIDX, +ARMExidxSyntheticSection::ARMExidxSyntheticSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_LINK_ORDER, SHT_ARM_EXIDX, ctx.arg.wordsize, ".ARM.exidx") {} static InputSection *findExidxSection(InputSection *isec) { @@ -4225,8 +4238,8 @@ bool ARMExidxSyntheticSection::isNeeded(Ctx &) const { 
[](InputSection *isec) { return isec->isLive(); }); } -ThunkSection::ThunkSection(OutputSection *os, uint64_t off) - : SyntheticSection(SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, +ThunkSection::ThunkSection(Ctx &ctx, OutputSection *os, uint64_t off) + : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, ctx.arg.emachine == EM_PPC64 ? 16 : 4, ".text.thunk") { this->parent = os; this->outSecOff = off; @@ -4269,8 +4282,8 @@ bool ThunkSection::assignOffsets() { return changed; } -PPC32Got2Section::PPC32Got2Section() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, 4, ".got2") {} +PPC32Got2Section::PPC32Got2Section(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, 4, ".got2") {} bool PPC32Got2Section::isNeeded(Ctx &) const { // See the comment below. This is not needed if there is no other @@ -4302,8 +4315,8 @@ void PPC32Got2Section::finalizeContents(Ctx &) { // directly in the binary so the section has type SHT_PROGBITS. If linking // position-independent code the section has type SHT_NOBITS since it will be // allocated and filled in by the dynamic linker. -PPC64LongBranchTargetSection::PPC64LongBranchTargetSection() - : SyntheticSection(SHF_ALLOC | SHF_WRITE, +PPC64LongBranchTargetSection::PPC64LongBranchTargetSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, ctx.arg.isPic ? 
SHT_NOBITS : SHT_PROGBITS, 8, ".branch_lt") {} @@ -4415,8 +4428,8 @@ template void elf::writePhdrs(uint8_t *buf, Partition &part) { } template -PartitionElfHeaderSection::PartitionElfHeaderSection() - : SyntheticSection(SHF_ALLOC, SHT_LLVM_PART_EHDR, 1, "") {} +PartitionElfHeaderSection::PartitionElfHeaderSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_LLVM_PART_EHDR, 1, "") {} template size_t PartitionElfHeaderSection::getSize(Ctx &ctx) const { @@ -4433,8 +4446,8 @@ void PartitionElfHeaderSection::writeTo(Ctx &ctx, uint8_t *buf) { } template -PartitionProgramHeadersSection::PartitionProgramHeadersSection() - : SyntheticSection(SHF_ALLOC, SHT_LLVM_PART_PHDR, 1, ".phdrs") {} +PartitionProgramHeadersSection::PartitionProgramHeadersSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_LLVM_PART_PHDR, 1, ".phdrs") {} template size_t PartitionProgramHeadersSection::getSize(Ctx &ctx) const { @@ -4446,8 +4459,8 @@ void PartitionProgramHeadersSection::writeTo(Ctx &ctx, uint8_t *buf) { writePhdrs(buf, getPartition()); } -PartitionIndexSection::PartitionIndexSection() - : SyntheticSection(SHF_ALLOC, SHT_PROGBITS, 4, ".rodata") {} +PartitionIndexSection::PartitionIndexSection(Ctx &ctx) + : SyntheticSection(ctx, SHF_ALLOC, SHT_PROGBITS, 4, ".rodata") {} size_t PartitionIndexSection::getSize(Ctx &ctx) const { return 12 * (ctx.partitions.size() - 1); @@ -4680,18 +4693,20 @@ template void elf::createSyntheticSections(Ctx &ctx) { auto add = [&](SyntheticSection &sec) { ctx.inputSections.push_back(&sec); }; if (ctx.arg.zSectionHeader) - ctx.in.shStrTab = std::make_unique(".shstrtab", false); + ctx.in.shStrTab = + std::make_unique(ctx, ".shstrtab", false); ctx.out.programHeaders = make("", 0, SHF_ALLOC); ctx.out.programHeaders->addralign = ctx.arg.wordsize; if (ctx.arg.strip != StripPolicy::All) { - ctx.in.strTab = std::make_unique(".strtab", false); - ctx.in.symTab = std::make_unique>(*ctx.in.strTab); - ctx.in.symTabShndx = std::make_unique(); + ctx.in.strTab = 
std::make_unique(ctx, ".strtab", false); + ctx.in.symTab = + std::make_unique>(ctx, *ctx.in.strTab); + ctx.in.symTabShndx = std::make_unique(ctx); } - ctx.in.bss = std::make_unique(".bss", 0, 1); + ctx.in.bss = std::make_unique(ctx, ".bss", 0, 1); add(*ctx.in.bss); // If there is a SECTIONS command and a .data.rel.ro section name use name @@ -4700,13 +4715,13 @@ template void elf::createSyntheticSections(Ctx &ctx) { bool hasDataRelRo = ctx.script->hasSectionsCommand && findSection(".data.rel.ro"); ctx.in.bssRelRo = std::make_unique( - hasDataRelRo ? ".data.rel.ro.bss" : ".bss.rel.ro", 0, 1); + ctx, hasDataRelRo ? ".data.rel.ro.bss" : ".bss.rel.ro", 0, 1); add(*ctx.in.bssRelRo); // Add MIPS-specific sections. if (ctx.arg.emachine == EM_MIPS) { if (!ctx.arg.shared && ctx.arg.hasDynSymTab) { - ctx.in.mipsRldMap = std::make_unique(); + ctx.in.mipsRldMap = std::make_unique(ctx); add(*ctx.in.mipsRldMap); } if ((ctx.in.mipsAbiFlags = MipsAbiFlagsSection::create(ctx))) @@ -4727,68 +4742,68 @@ template void elf::createSyntheticSections(Ctx &ctx) { }; if (!part.name.empty()) { - part.elfHeader = std::make_unique>(); + part.elfHeader = std::make_unique>(ctx); part.elfHeader->name = part.name; add(*part.elfHeader); part.programHeaders = - std::make_unique>(); + std::make_unique>(ctx); add(*part.programHeaders); } if (ctx.arg.buildId != BuildIdKind::None) { - part.buildId = std::make_unique(); + part.buildId = std::make_unique(ctx); add(*part.buildId); } // dynSymTab is always present to simplify sym->includeInDynsym() in // finalizeSections. 
- part.dynStrTab = std::make_unique(".dynstr", true); + part.dynStrTab = std::make_unique(ctx, ".dynstr", true); part.dynSymTab = - std::make_unique>(*part.dynStrTab); + std::make_unique>(ctx, *part.dynStrTab); if (ctx.arg.relocatable) continue; - part.dynamic = std::make_unique>(); + part.dynamic = std::make_unique>(ctx); if (hasMemtag()) { - part.memtagAndroidNote = std::make_unique(); + part.memtagAndroidNote = std::make_unique(ctx); add(*part.memtagAndroidNote); if (canHaveMemtagGlobals()) { part.memtagGlobalDescriptors = - std::make_unique(); + std::make_unique(ctx); add(*part.memtagGlobalDescriptors); } } if (ctx.arg.androidPackDynRelocs) part.relaDyn = std::make_unique>( - relaDynName, threadCount); + ctx, relaDynName, threadCount); else part.relaDyn = std::make_unique>( - relaDynName, ctx.arg.zCombreloc, threadCount); + ctx, relaDynName, ctx.arg.zCombreloc, threadCount); if (ctx.arg.hasDynSymTab) { add(*part.dynSymTab); - part.verSym = std::make_unique(); + part.verSym = std::make_unique(ctx); add(*part.verSym); if (!namedVersionDefs(ctx).empty()) { - part.verDef = std::make_unique(); + part.verDef = std::make_unique(ctx); add(*part.verDef); } - part.verNeed = std::make_unique>(); + part.verNeed = std::make_unique>(ctx); add(*part.verNeed); if (ctx.arg.gnuHash) { - part.gnuHashTab = std::make_unique(); + part.gnuHashTab = std::make_unique(ctx); add(*part.gnuHashTab); } if (ctx.arg.sysvHash) { - part.hashTab = std::make_unique(); + part.hashTab = std::make_unique(ctx); add(*part.hashTab); } @@ -4798,28 +4813,28 @@ template void elf::createSyntheticSections(Ctx &ctx) { add(*part.relaDyn); if (ctx.arg.relrPackDynRelocs) { - part.relrDyn = std::make_unique>(threadCount); + part.relrDyn = std::make_unique>(ctx, threadCount); add(*part.relrDyn); part.relrAuthDyn = std::make_unique>( - threadCount, /*isAArch64Auth=*/true); + ctx, threadCount, /*isAArch64Auth=*/true); add(*part.relrAuthDyn); } if (ctx.arg.ehFrameHdr) { - part.ehFrameHdr = std::make_unique(); + 
part.ehFrameHdr = std::make_unique(ctx); add(*part.ehFrameHdr); } - part.ehFrame = std::make_unique(); + part.ehFrame = std::make_unique(ctx); add(*part.ehFrame); if (ctx.arg.emachine == EM_ARM) { // This section replaces all the individual .ARM.exidx InputSections. - part.armExidx = std::make_unique(); + part.armExidx = std::make_unique(ctx); add(*part.armExidx); } if (!ctx.arg.packageMetadata.empty()) { - part.packageMetadataNote = std::make_unique(); + part.packageMetadataNote = std::make_unique(ctx); add(*part.packageMetadataNote); } } @@ -4829,11 +4844,11 @@ template void elf::createSyntheticSections(Ctx &ctx) { // so that it is sorted after all other partitions. It also has other // special handling (see createPhdrs() and combineEhSections()). ctx.in.partEnd = - std::make_unique(".part.end", ctx.arg.maxPageSize, 1); + std::make_unique(ctx, ".part.end", ctx.arg.maxPageSize, 1); ctx.in.partEnd->partition = 255; add(*ctx.in.partEnd); - ctx.in.partIndex = std::make_unique(); + ctx.in.partIndex = std::make_unique(ctx); addOptionalRegular("__part_index_begin", ctx.in.partIndex.get(), 0); addOptionalRegular("__part_index_end", ctx.in.partIndex.get(), ctx.in.partIndex->getSize(ctx)); @@ -4843,34 +4858,34 @@ template void elf::createSyntheticSections(Ctx &ctx) { // Add .got. MIPS' .got is so different from the other archs, // it has its own class. 
if (ctx.arg.emachine == EM_MIPS) { - ctx.in.mipsGot = std::make_unique(); + ctx.in.mipsGot = std::make_unique(ctx); add(*ctx.in.mipsGot); } else { - ctx.in.got = std::make_unique(); + ctx.in.got = std::make_unique(ctx); add(*ctx.in.got); } if (ctx.arg.emachine == EM_PPC) { - ctx.in.ppc32Got2 = std::make_unique(); + ctx.in.ppc32Got2 = std::make_unique(ctx); add(*ctx.in.ppc32Got2); } if (ctx.arg.emachine == EM_PPC64) { ctx.in.ppc64LongBranchTarget = - std::make_unique(); + std::make_unique(ctx); add(*ctx.in.ppc64LongBranchTarget); } - ctx.in.gotPlt = std::make_unique(); + ctx.in.gotPlt = std::make_unique(ctx); add(*ctx.in.gotPlt); - ctx.in.igotPlt = std::make_unique(); + ctx.in.igotPlt = std::make_unique(ctx); add(*ctx.in.igotPlt); // Add .relro_padding if DATA_SEGMENT_RELRO_END is used; otherwise, add the // section in the absence of PHDRS/SECTIONS commands. if (ctx.arg.zRelro && ((ctx.script->phdrsCommands.empty() && !ctx.script->hasSectionsCommand) || ctx.script->seenRelroEnd)) { - ctx.in.relroPadding = std::make_unique(); + ctx.in.relroPadding = std::make_unique(ctx); add(*ctx.in.relroPadding); } @@ -4891,34 +4906,34 @@ template void elf::createSyntheticSections(Ctx &ctx) { // We always need to add rel[a].plt to output if it has entries. // Even for static linking it can contain R_[*]_IRELATIVE relocations. ctx.in.relaPlt = std::make_unique>( - ctx.arg.isRela ? ".rela.plt" : ".rel.plt", /*sort=*/false, + ctx, ctx.arg.isRela ? 
".rela.plt" : ".rel.plt", /*sort=*/false, /*threadCount=*/1); add(*ctx.in.relaPlt); if ((ctx.arg.emachine == EM_386 || ctx.arg.emachine == EM_X86_64) && (ctx.arg.andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT)) { - ctx.in.ibtPlt = std::make_unique(); + ctx.in.ibtPlt = std::make_unique(ctx); add(*ctx.in.ibtPlt); } if (ctx.arg.emachine == EM_PPC) - ctx.in.plt = std::make_unique(); + ctx.in.plt = std::make_unique(ctx); else - ctx.in.plt = std::make_unique(); + ctx.in.plt = std::make_unique(ctx); add(*ctx.in.plt); - ctx.in.iplt = std::make_unique(); + ctx.in.iplt = std::make_unique(ctx); add(*ctx.in.iplt); if (ctx.arg.andFeatures || !ctx.aarch64PauthAbiCoreInfo.empty()) - add(*make()); + add(*make(ctx)); if (ctx.arg.debugNames) { - ctx.in.debugNames = std::make_unique>(); + ctx.in.debugNames = std::make_unique>(ctx); add(*ctx.in.debugNames); } if (ctx.arg.gdbIndex) { - ctx.in.gdbIndex = GdbIndexSection::create(); + ctx.in.gdbIndex = GdbIndexSection::create(ctx); add(*ctx.in.gdbIndex); } @@ -4928,7 +4943,7 @@ template void elf::createSyntheticSections(Ctx &ctx) { // is irrelevant these days. Stack area should always be non-executable // by default. So we emit this section unconditionally. if (ctx.arg.relocatable) - add(*make()); + add(*make(ctx)); if (ctx.in.symTab) add(*ctx.in.symTab); diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index a40e091500545a..b89860dd21371a 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -50,7 +50,7 @@ struct CieRecord { // Section for .eh_frame. 
class EhFrameSection final : public SyntheticSection { public: - EhFrameSection(); + EhFrameSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; void finalizeContents(Ctx &) override; bool isNeeded(Ctx &) const override { return !sections.empty(); } @@ -104,7 +104,7 @@ class EhFrameSection final : public SyntheticSection { class GotSection final : public SyntheticSection { public: - GotSection(); + GotSection(Ctx &); size_t getSize(Ctx &ctx) const override { return size; } void finalizeContents(Ctx &) override; bool isNeeded(Ctx &) const override; @@ -136,15 +136,16 @@ class GotSection final : public SyntheticSection { // .note.GNU-stack section. class GnuStackSection : public SyntheticSection { public: - GnuStackSection() - : SyntheticSection(0, llvm::ELF::SHT_PROGBITS, 1, ".note.GNU-stack") {} + GnuStackSection(Ctx &ctx) + : SyntheticSection(ctx, 0, llvm::ELF::SHT_PROGBITS, 1, + ".note.GNU-stack") {} void writeTo(Ctx &, uint8_t *buf) override {} size_t getSize(Ctx &ctx) const override { return 0; } }; class GnuPropertySection final : public SyntheticSection { public: - GnuPropertySection(); + GnuPropertySection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; }; @@ -156,7 +157,7 @@ class BuildIdSection : public SyntheticSection { public: const size_t hashSize; - BuildIdSection(); + BuildIdSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return headerSize + hashSize; } void writeBuildId(llvm::ArrayRef buf); @@ -171,7 +172,7 @@ class BuildIdSection : public SyntheticSection { // respectively. 
class BssSection final : public SyntheticSection { public: - BssSection(StringRef name, uint64_t size, uint32_t addralign); + BssSection(Ctx &, StringRef name, uint64_t size, uint32_t addralign); void writeTo(Ctx &, uint8_t *) override {} bool isNeeded(Ctx &) const override { return size != 0; } size_t getSize(Ctx &ctx) const override { return size; } @@ -182,7 +183,7 @@ class BssSection final : public SyntheticSection { class MipsGotSection final : public SyntheticSection { public: - MipsGotSection(); + MipsGotSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return size; } bool updateAllocSize(Ctx &) override; @@ -359,7 +360,7 @@ class MipsGotSection final : public SyntheticSection { class GotPltSection final : public SyntheticSection { public: - GotPltSection(); + GotPltSection(Ctx &); void addEntry(Symbol &sym); size_t getSize(Ctx &) const override; void writeTo(Ctx &, uint8_t *buf) override; @@ -379,7 +380,7 @@ class GotPltSection final : public SyntheticSection { // on ARM the IgotPltSection will immediately follow the GotSection. 
class IgotPltSection final : public SyntheticSection { public: - IgotPltSection(); + IgotPltSection(Ctx &); void addEntry(Symbol &sym); size_t getSize(Ctx &) const override; void writeTo(Ctx &, uint8_t *buf) override; @@ -391,7 +392,7 @@ class IgotPltSection final : public SyntheticSection { class StringTableSection final : public SyntheticSection { public: - StringTableSection(StringRef name, bool dynamic); + StringTableSection(Ctx &, StringRef name, bool dynamic); unsigned addString(StringRef s, bool hashIt = true); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return size; } @@ -483,7 +484,7 @@ template class DynamicSection final : public SyntheticSection { LLVM_ELF_IMPORT_TYPES_ELFT(ELFT) public: - DynamicSection(); + DynamicSection(Ctx &); void finalizeContents(Ctx &) override; void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return size; } @@ -495,9 +496,9 @@ template class DynamicSection final : public SyntheticSection { class RelocationBaseSection : public SyntheticSection { public: - RelocationBaseSection(StringRef name, uint32_t type, int32_t dynamicTag, - int32_t sizeDynamicTag, bool combreloc, - unsigned concurrency); + RelocationBaseSection(Ctx &, StringRef name, uint32_t type, + int32_t dynamicTag, int32_t sizeDynamicTag, + bool combreloc, unsigned concurrency); /// Add a dynamic relocation without writing an addend to the output section. /// This overload can be used if the addends are written directly instead of /// using relocations on the input section (e.g. MipsGotSection::writeTo()). 
@@ -578,7 +579,8 @@ class RelocationSection final : public RelocationBaseSection { using Elf_Rela = typename ELFT::Rela; public: - RelocationSection(StringRef name, bool combreloc, unsigned concurrency); + RelocationSection(Ctx &, StringRef name, bool combreloc, + unsigned concurrency); void writeTo(Ctx &, uint8_t *buf) override; }; @@ -588,7 +590,7 @@ class AndroidPackedRelocationSection final : public RelocationBaseSection { using Elf_Rela = typename ELFT::Rela; public: - AndroidPackedRelocationSection(StringRef name, unsigned concurrency); + AndroidPackedRelocationSection(Ctx &, StringRef name, unsigned concurrency); bool updateAllocSize(Ctx &) override; size_t getSize(Ctx &ctx) const override { return relocData.size(); } @@ -611,7 +613,7 @@ struct RelativeReloc { class RelrBaseSection : public SyntheticSection { public: - RelrBaseSection(unsigned concurrency, bool isAArch64Auth = false); + RelrBaseSection(Ctx &, unsigned concurrency, bool isAArch64Auth = false); void mergeRels(); bool isNeeded(Ctx &) const override { return !relocs.empty() || @@ -629,7 +631,7 @@ template class RelrSection final : public RelrBaseSection { using Elf_Relr = typename ELFT::Relr; public: - RelrSection(unsigned concurrency, bool isAArch64Auth = false); + RelrSection(Ctx &, unsigned concurrency, bool isAArch64Auth = false); bool updateAllocSize(Ctx &) override; size_t getSize(Ctx &ctx) const override { @@ -650,7 +652,7 @@ struct SymbolTableEntry { class SymbolTableBaseSection : public SyntheticSection { public: - SymbolTableBaseSection(StringTableSection &strTabSec); + SymbolTableBaseSection(Ctx &ctx, StringTableSection &strTabSec); void finalizeContents(Ctx &) override; size_t getSize(Ctx &ctx) const override { return getNumSymbols() * entsize; } void addSymbol(Symbol *sym); @@ -676,13 +678,13 @@ class SymbolTableSection final : public SymbolTableBaseSection { using Elf_Sym = typename ELFT::Sym; public: - SymbolTableSection(StringTableSection &strTabSec); + SymbolTableSection(Ctx &, 
StringTableSection &strTabSec); void writeTo(Ctx &, uint8_t *buf) override; }; class SymtabShndxSection final : public SyntheticSection { public: - SymtabShndxSection(); + SymtabShndxSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; @@ -694,7 +696,7 @@ class SymtabShndxSection final : public SyntheticSection { // https://blogs.oracle.com/ali/entry/gnu_hash_elf_sections class GnuHashTableSection final : public SyntheticSection { public: - GnuHashTableSection(); + GnuHashTableSection(Ctx &); void finalizeContents(Ctx &) override; void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return size; } @@ -722,7 +724,7 @@ class GnuHashTableSection final : public SyntheticSection { class HashTableSection final : public SyntheticSection { public: - HashTableSection(); + HashTableSection(Ctx &); void finalizeContents(Ctx &) override; void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return size; } @@ -744,7 +746,7 @@ class HashTableSection final : public SyntheticSection { // target (BIND_NOW) or a .plt entry. class PltSection : public SyntheticSection { public: - PltSection(); + PltSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; bool isNeeded(Ctx &) const override; @@ -765,7 +767,7 @@ class IpltSection final : public SyntheticSection { SmallVector entries; public: - IpltSection(); + IpltSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; bool isNeeded(Ctx &) const override { return !entries.empty(); } @@ -775,7 +777,7 @@ class IpltSection final : public SyntheticSection { class PPC32GlinkSection : public PltSection { public: - PPC32GlinkSection(); + PPC32GlinkSection(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; @@ -786,7 +788,7 @@ class PPC32GlinkSection : public PltSection { // This is x86-only. 
class IBTPltSection : public SyntheticSection { public: - IBTPltSection(); + IBTPltSection(Ctx &); void writeTo(Ctx &, uint8_t *Buf) override; bool isNeeded(Ctx &) const override; size_t getSize(Ctx &) const override; @@ -797,7 +799,7 @@ class IBTPltSection : public SyntheticSection { // pages in the PT_LOAD segment is covered by at least one section. class RelroPaddingSection final : public SyntheticSection { public: - RelroPaddingSection(); + RelroPaddingSection(Ctx &); size_t getSize(Ctx &ctx) const override { return 0; } void writeTo(Ctx &, uint8_t *buf) override {} }; @@ -872,7 +874,7 @@ class DebugNamesBaseSection : public SyntheticSection { SmallVector compUnits; }; - DebugNamesBaseSection(); + DebugNamesBaseSection(Ctx &); size_t getSize(Ctx &ctx) const override { return size; } bool isNeeded(Ctx &) const override { return numChunks > 0; } @@ -916,7 +918,7 @@ class DebugNamesBaseSection : public SyntheticSection { template class DebugNamesSection final : public DebugNamesBaseSection { public: - DebugNamesSection(); + DebugNamesSection(Ctx &); void finalizeContents(Ctx &) override; void writeTo(Ctx &, uint8_t *buf) override; @@ -963,8 +965,9 @@ class GdbIndexSection final : public SyntheticSection { uint32_t cuVectorOff; }; - GdbIndexSection(); - template static std::unique_ptr create(); + GdbIndexSection(Ctx &); + template + static std::unique_ptr create(Ctx &); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { return size; } bool isNeeded(Ctx &) const override; @@ -1002,7 +1005,7 @@ class GdbIndexSection final : public SyntheticSection { // http://www.airs.com/blog/archives/462 (".eh_frame_hdr") class EhFrameHeader final : public SyntheticSection { public: - EhFrameHeader(); + EhFrameHeader(Ctx &); void write(); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; @@ -1019,7 +1022,7 @@ class EhFrameHeader final : public SyntheticSection { // followed by an array of Elf_Verdaux structures. 
class VersionDefinitionSection final : public SyntheticSection { public: - VersionDefinitionSection(); + VersionDefinitionSection(Ctx &); void finalizeContents(Ctx &) override; size_t getSize(Ctx &) const override; void writeTo(Ctx &, uint8_t *buf) override; @@ -1041,7 +1044,7 @@ class VersionDefinitionSection final : public SyntheticSection { // the own object or in any of the dependencies. class VersionTableSection final : public SyntheticSection { public: - VersionTableSection(); + VersionTableSection(Ctx &); void finalizeContents(Ctx &) override; size_t getSize(Ctx &) const override; void writeTo(Ctx &, uint8_t *buf) override; @@ -1072,7 +1075,7 @@ class VersionNeedSection final : public SyntheticSection { SmallVector verneeds; public: - VersionNeedSection(); + VersionNeedSection(Ctx &); void finalizeContents(Ctx &) override; void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; @@ -1089,14 +1092,14 @@ class MergeSyntheticSection : public SyntheticSection { SmallVector sections; protected: - MergeSyntheticSection(StringRef name, uint32_t type, uint64_t flags, + MergeSyntheticSection(Ctx &ctx, StringRef name, uint32_t type, uint64_t flags, uint32_t addralign) - : SyntheticSection(flags, type, addralign, name) {} + : SyntheticSection(ctx, flags, type, addralign, name) {} }; class MergeTailSection final : public MergeSyntheticSection { public: - MergeTailSection(StringRef name, uint32_t type, uint64_t flags, + MergeTailSection(Ctx &ctx, StringRef name, uint32_t type, uint64_t flags, uint32_t addralign); size_t getSize(Ctx &) const override; @@ -1109,9 +1112,9 @@ class MergeTailSection final : public MergeSyntheticSection { class MergeNoTailSection final : public MergeSyntheticSection { public: - MergeNoTailSection(StringRef name, uint32_t type, uint64_t flags, + MergeNoTailSection(Ctx &ctx, StringRef name, uint32_t type, uint64_t flags, uint32_t addralign) - : MergeSyntheticSection(name, type, flags, addralign) {} + : 
MergeSyntheticSection(ctx, name, type, flags, addralign) {} size_t getSize(Ctx &ctx) const override { return size; } void writeTo(Ctx &, uint8_t *buf) override; @@ -1145,7 +1148,7 @@ class MipsAbiFlagsSection final : public SyntheticSection { public: static std::unique_ptr create(Ctx &); - MipsAbiFlagsSection(Elf_Mips_ABIFlags flags); + MipsAbiFlagsSection(Ctx &, Elf_Mips_ABIFlags flags); size_t getSize(Ctx &ctx) const override { return sizeof(Elf_Mips_ABIFlags); } void writeTo(Ctx &, uint8_t *buf) override; @@ -1161,7 +1164,7 @@ template class MipsOptionsSection final : public SyntheticSection { public: static std::unique_ptr> create(Ctx &); - MipsOptionsSection(Elf_Mips_RegInfo reginfo); + MipsOptionsSection(Ctx &, Elf_Mips_RegInfo reginfo); void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &ctx) const override { @@ -1179,7 +1182,7 @@ template class MipsReginfoSection final : public SyntheticSection { public: static std::unique_ptr create(Ctx &); - MipsReginfoSection(Elf_Mips_RegInfo reginfo); + MipsReginfoSection(Ctx &, Elf_Mips_RegInfo reginfo); size_t getSize(Ctx &ctx) const override { return sizeof(Elf_Mips_RegInfo); } void writeTo(Ctx &, uint8_t *buf) override; @@ -1193,7 +1196,7 @@ template class MipsReginfoSection final : public SyntheticSection { // ftp://www.linux-mips.org/pub/linux/mips/doc/ABI/mipsabi.pdf class MipsRldMapSection final : public SyntheticSection { public: - MipsRldMapSection(); + MipsRldMapSection(Ctx &); size_t getSize(Ctx &ctx) const override { return ctx.arg.wordsize; } void writeTo(Ctx &, uint8_t *buf) override {} }; @@ -1234,7 +1237,7 @@ class MipsRldMapSection final : public SyntheticSection { // either find the .ARM.exidx section or know that we need to generate one. class ARMExidxSyntheticSection : public SyntheticSection { public: - ARMExidxSyntheticSection(); + ARMExidxSyntheticSection(Ctx &); // Add an input section to the ARMExidxSyntheticSection. 
Returns whether the // section needs to be removed from the main input section list. @@ -1281,7 +1284,7 @@ class ARMExidxSyntheticSection : public SyntheticSection { class ThunkSection final : public SyntheticSection { public: // ThunkSection in OS, with desired outSecOff of Off - ThunkSection(OutputSection *os, uint64_t off); + ThunkSection(Ctx &, OutputSection *os, uint64_t off); // Add a newly created Thunk to this container: // Thunk is given offset from start of this InputSection @@ -1332,7 +1335,7 @@ class ArmCmseSGSection final : public SyntheticSection { // synthesize PLT entries for PPC32 Secure PLT ABI. class PPC32Got2Section final : public SyntheticSection { public: - PPC32Got2Section(); + PPC32Got2Section(Ctx &); size_t getSize(Ctx &ctx) const override { return 0; } bool isNeeded(Ctx &) const override; void finalizeContents(Ctx &) override; @@ -1346,7 +1349,7 @@ class PPC32Got2Section final : public SyntheticSection { // filled in by the dynamic linker. class PPC64LongBranchTargetSection final : public SyntheticSection { public: - PPC64LongBranchTargetSection(); + PPC64LongBranchTargetSection(Ctx &); uint64_t getEntryVA(const Symbol *sym, int64_t addend); std::optional addEntry(const Symbol *sym, int64_t addend); size_t getSize(Ctx &) const override; @@ -1363,7 +1366,7 @@ class PPC64LongBranchTargetSection final : public SyntheticSection { template class PartitionElfHeaderSection final : public SyntheticSection { public: - PartitionElfHeaderSection(); + PartitionElfHeaderSection(Ctx &); size_t getSize(Ctx &) const override; void writeTo(Ctx &, uint8_t *buf) override; }; @@ -1371,14 +1374,14 @@ class PartitionElfHeaderSection final : public SyntheticSection { template class PartitionProgramHeadersSection final : public SyntheticSection { public: - PartitionProgramHeadersSection(); + PartitionProgramHeadersSection(Ctx &); size_t getSize(Ctx &) const override; void writeTo(Ctx &, uint8_t *buf) override; }; class PartitionIndexSection final : public 
SyntheticSection { public: - PartitionIndexSection(); + PartitionIndexSection(Ctx &); size_t getSize(Ctx &) const override; void finalizeContents(Ctx &) override; void writeTo(Ctx &, uint8_t *buf) override; @@ -1389,8 +1392,8 @@ class PartitionIndexSection final : public SyntheticSection { // https://cs.android.com/android/platform/superproject/+/master:bionic/libc/bionic/libc_init_static.cpp;drc=9425b16978f9c5aa8f2c50c873db470819480d1d;l=192 class MemtagAndroidNote final : public SyntheticSection { public: - MemtagAndroidNote() - : SyntheticSection(llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, + MemtagAndroidNote(Ctx &ctx) + : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, /*alignment=*/4, ".note.android.memtag") {} void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; @@ -1398,8 +1401,8 @@ class MemtagAndroidNote final : public SyntheticSection { class PackageMetadataNote final : public SyntheticSection { public: - PackageMetadataNote() - : SyntheticSection(llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, + PackageMetadataNote(Ctx &ctx) + : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, /*alignment=*/4, ".note.package") {} void writeTo(Ctx &, uint8_t *buf) override; size_t getSize(Ctx &) const override; @@ -1407,8 +1410,8 @@ class PackageMetadataNote final : public SyntheticSection { class MemtagGlobalDescriptors final : public SyntheticSection { public: - MemtagGlobalDescriptors() - : SyntheticSection(llvm::ELF::SHF_ALLOC, + MemtagGlobalDescriptors(Ctx &ctx) + : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_AARCH64_MEMTAG_GLOBALS_DYNAMIC, /*alignment=*/4, ".memtag.globals.dynamic") {} void writeTo(Ctx &, uint8_t *buf) override; From 1fd79f105da64cec7986807c1d9c4896bd39dafa Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 08:08:28 +0200 Subject: [PATCH 129/177] [clang][bytecode] Check number of addcarry/subborrow args (#111952) Apparently this can fail as well. 
--- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 1765193f5bebbc..74e9e1cf629372 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1287,7 +1287,7 @@ static bool interp__builtin_ia32_addcarry_subborrow(InterpState &S, const InterpFrame *Frame, const Function *Func, const CallExpr *Call) { - if (!Call->getArg(0)->getType()->isIntegerType() || + if (Call->getNumArgs() != 4 || !Call->getArg(0)->getType()->isIntegerType() || !Call->getArg(1)->getType()->isIntegerType() || !Call->getArg(2)->getType()->isIntegerType()) return false; From d91c103a107ab16b59c1bb67687233a1100d7ecf Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 10 Oct 2024 23:28:25 -0700 Subject: [PATCH 130/177] [ELF] Pass Ctx & to SyntheticSections --- lld/ELF/InputSection.h | 4 +++- lld/ELF/SyntheticSections.cpp | 18 ++++++++++-------- lld/ELF/SyntheticSections.h | 4 ++-- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index 543ff4db3c3270..bf2cf09f2921b2 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -472,10 +472,12 @@ static_assert(sizeof(InputSection) <= 160, "InputSection is too big"); class SyntheticSection : public InputSection { public: + Ctx &ctx; SyntheticSection(Ctx &ctx, uint64_t flags, uint32_t type, uint32_t addralign, StringRef name) : InputSection(ctx.internalFile, flags, type, addralign, {}, name, - InputSectionBase::Synthetic) {} + InputSectionBase::Synthetic), + ctx(ctx) {} virtual ~SyntheticSection() = default; virtual size_t getSize(Ctx &) const = 0; diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index a65c137762ce63..5d62f089e40848 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -59,7 +59,7 @@ using llvm::support::endian::write64le; 
constexpr size_t MergeNoTailSection::numShards; -static uint64_t readUint(uint8_t *buf) { +static uint64_t readUint(Ctx &ctx, uint8_t *buf) { return ctx.arg.is64 ? read64(buf) : read32(buf); } @@ -267,7 +267,7 @@ MipsReginfoSection::create(Ctx &ctx) { return std::make_unique>(ctx, reginfo); } -InputSection *elf::createInterpSection() { +InputSection *elf::createInterpSection(Ctx &) { // StringSaver guarantees that the returned string ends with '\0'. StringRef s = saver().save(ctx.arg.dynamicLinker); ArrayRef contents = {(const uint8_t *)s.data(), s.size() + 1}; @@ -609,7 +609,7 @@ static uint64_t readFdeAddr(uint8_t *buf, int size) { case DW_EH_PE_sdata8: return read64(buf); case DW_EH_PE_absptr: - return readUint(buf); + return readUint(ctx, buf); } fatal("unknown FDE size encoding"); } @@ -1452,7 +1452,8 @@ DynamicSection::computeContents() { addInSec(DT_PLTGOT, *ctx.in.plt); break; case EM_AARCH64: - if (llvm::find_if(ctx.in.relaPlt->relocs, [](const DynamicReloc &r) { + if (llvm::find_if(ctx.in.relaPlt->relocs, [&ctx = ctx]( + const DynamicReloc &r) { return r.type == ctx.target->pltRel && r.sym->stOther & STO_AARCH64_VARIANT_PCS; }) != ctx.in.relaPlt->relocs.end()) @@ -1460,7 +1461,8 @@ DynamicSection::computeContents() { addInSec(DT_PLTGOT, *ctx.in.gotPlt); break; case EM_RISCV: - if (llvm::any_of(ctx.in.relaPlt->relocs, [](const DynamicReloc &r) { + if (llvm::any_of(ctx.in.relaPlt->relocs, [&ctx = ctx]( + const DynamicReloc &r) { return r.type == ctx.target->pltRel && (r.sym->stOther & STO_RISCV_VARIANT_CC); })) @@ -2441,7 +2443,7 @@ void GnuHashTableSection::writeTo(Ctx &ctx, uint8_t *buf) { // When C = 64, we choose a word with bits [6:...] and set 1 to two bits in // the word using bits [0:5] and [26:31]. 
size_t i = (sym.hash / c) & (maskWords - 1); - uint64_t val = readUint(buf + i * ctx.arg.wordsize); + uint64_t val = readUint(ctx, buf + i * ctx.arg.wordsize); val |= uint64_t(1) << (sym.hash % c); val |= uint64_t(1) << ((sym.hash >> Shift2) % c); writeUint(buf + i * ctx.arg.wordsize, val); @@ -3513,7 +3515,7 @@ createSymbols( // Returns a newly-created .gdb_index section. template -std::unique_ptr GdbIndexSection::create(Ctx &) { +std::unique_ptr GdbIndexSection::create(Ctx &ctx) { llvm::TimeTraceScope timeScope("Create gdb index"); // Collect InputFiles with .debug_info. See the comment in @@ -4684,7 +4686,7 @@ template void elf::createSyntheticSections(Ctx &ctx) { // SyntheticSections coming last. if (needsInterpSection(ctx)) { for (size_t i = 1; i <= ctx.partitions.size(); ++i) { - InputSection *sec = createInterpSection(); + InputSection *sec = createInterpSection(ctx); sec->partition = i; ctx.inputSections.push_back(sec); } diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index b89860dd21371a..283b2953449e59 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -554,7 +554,7 @@ class RelocationBaseSection : public SyntheticSection { (d->type == llvm::ELF::SHT_RELA || d->type == llvm::ELF::SHT_REL || d->type == llvm::ELF::SHT_RELR || (d->type == llvm::ELF::SHT_AARCH64_AUTH_RELR && - ctx.arg.emachine == llvm::ELF::EM_AARCH64)); + elf::ctx.arg.emachine == llvm::ELF::EM_AARCH64)); } int32_t dynamicTag, sizeDynamicTag; SmallVector relocs; @@ -1433,7 +1433,7 @@ class MemtagGlobalDescriptors final : public SyntheticSection { }; template void createSyntheticSections(Ctx &); -InputSection *createInterpSection(); +InputSection *createInterpSection(Ctx &); MergeInputSection *createCommentSection(); template void splitSections(Ctx &); void combineEhSections(Ctx &); From d656b2063262d59c3565e63095104c01d1f6a5a3 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Fri, 11 Oct 2024 08:37:20 +0200 Subject: [PATCH 131/177] 
[AMDGPU][SplitModule] Cleanup CallsExternal Handling (#106528) - Don't treat inline ASM as indirect calls - Remove call to alias testing, which was broken (only working by pure luck right now) and isn't needed anyway. GlobalOpt should take care of them for us. --- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 85 ++++++++++++++----- .../AMDGPU/indirect-call-inline-asm-debug.ll | 28 ++++++ .../AMDGPU/indirect-call-inline-asm.ll | 30 +++++++ .../AMDGPU/kernels-alias-dependencies.ll | 41 --------- .../AMDGPU/kernels-dependency-indirect.ll | 12 --- 5 files changed, 121 insertions(+), 75 deletions(-) create mode 100644 llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm-debug.ll create mode 100644 llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll delete mode 100644 llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index e97a7f4e075f7f..a62c72d124825e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -43,6 +43,7 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Module.h" #include "llvm/IR/User.h" @@ -103,6 +104,11 @@ static cl::opt NoExternalizeGlobals( cl::desc("disables externalization of global variable with local linkage; " "may cause globals to be duplicated which increases binary size")); +static cl::opt NoExternalizeOnAddrTaken( + "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden, + cl::desc( + "disables externalization of functions whose addresses are taken")); + static cl::opt ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg", cl::Hidden, @@ -482,6 +488,9 @@ void SplitGraph::buildGraph(CallGraph &CG) { dbgs() << "[build graph] constructing graph representation of the input\n"); + // 
FIXME(?): Is the callgraph really worth using if we have to iterate the + // function again whenever it fails to give us enough information? + // We build the graph by just iterating all functions in the module and // working on their direct callees. At the end, all nodes should be linked // together as expected. @@ -492,29 +501,52 @@ void SplitGraph::buildGraph(CallGraph &CG) { continue; // Look at direct callees and create the necessary edges in the graph. - bool HasIndirectCall = false; - Node &N = getNode(Cache, Fn); + SetVector DirectCallees; + bool CallsExternal = false; for (auto &CGEntry : *CG[&Fn]) { auto *CGNode = CGEntry.second; - auto *Callee = CGNode->getFunction(); - if (!Callee) { - // TODO: Don't consider inline assembly as indirect calls. - if (CGNode == CG.getCallsExternalNode()) - HasIndirectCall = true; - continue; - } - - if (!Callee->isDeclaration()) - createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall); + if (auto *Callee = CGNode->getFunction()) { + if (!Callee->isDeclaration()) + DirectCallees.insert(Callee); + } else if (CGNode == CG.getCallsExternalNode()) + CallsExternal = true; } // Keep track of this function if it contains an indirect call and/or if it // can be indirectly called. - if (HasIndirectCall) { - LLVM_DEBUG(dbgs() << "indirect call found in " << Fn.getName() << "\n"); - FnsWithIndirectCalls.push_back(&Fn); + if (CallsExternal) { + LLVM_DEBUG(dbgs() << " [!] callgraph is incomplete for "; + Fn.printAsOperand(dbgs()); + dbgs() << " - analyzing function\n"); + + bool HasIndirectCall = false; + for (const auto &Inst : instructions(Fn)) { + // look at all calls without a direct callee. + if (const auto *CB = dyn_cast(&Inst); + CB && !CB->getCalledFunction()) { + // inline assembly can be ignored, unless InlineAsmIsIndirectCall is + // true. + if (CB->isInlineAsm()) { + LLVM_DEBUG(dbgs() << " found inline assembly\n"); + continue; + } + + // everything else is handled conservatively. 
+ HasIndirectCall = true; + break; + } + } + + if (HasIndirectCall) { + LLVM_DEBUG(dbgs() << " indirect call found\n"); + FnsWithIndirectCalls.push_back(&Fn); + } } + Node &N = getNode(Cache, Fn); + for (const auto *Callee : DirectCallees) + createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall); + if (canBeIndirectlyCalled(Fn)) IndirectlyCallableFns.push_back(&Fn); } @@ -1326,13 +1358,21 @@ static void splitAMDGPUModule( // // Additionally, it guides partitioning to not duplicate this function if it's // called directly at some point. - for (auto &Fn : M) { - if (Fn.hasAddressTaken()) { - if (Fn.hasLocalLinkage()) { - LLVM_DEBUG(dbgs() << "[externalize] " << Fn.getName() - << " because its address is taken\n"); + // + // TODO: Could we be smarter about this ? This makes all functions whose + // addresses are taken non-copyable. We should probably model this type of + // constraint in the graph and use it to guide splitting, instead of + // externalizing like this. Maybe non-copyable should really mean "keep one + // visible copy, then internalize all other copies" for some functions? + if (!NoExternalizeOnAddrTaken) { + for (auto &Fn : M) { + // TODO: Should aliases count? Probably not but they're so rare I'm not + // sure it's worth fixing. + if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) { + LLVM_DEBUG(dbgs() << "[externalize] "; Fn.printAsOperand(dbgs()); + dbgs() << " because its address is taken\n"); + externalize(Fn); } - externalize(Fn); } } @@ -1368,7 +1408,8 @@ static void splitAMDGPUModule( dbgs() << "[graph] nodes:\n"; for (const SplitGraph::Node *N : SG.nodes()) { dbgs() << " - [" << N->getID() << "]: " << N->getName() << " " - << (N->isGraphEntryPoint() ? "(entry)" : "") << "\n"; + << (N->isGraphEntryPoint() ? "(entry)" : "") << " " + << (N->isNonCopyable() ? 
"(noncopyable)" : "") << "\n"; } }); diff --git a/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm-debug.ll b/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm-debug.ll new file mode 100644 index 00000000000000..5b15e740f76b96 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm-debug.ll @@ -0,0 +1,28 @@ +; REQUIRES: asserts + +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-address-taken -debug-only=amdgpu-split-module 2>&1 | FileCheck %s + +; CHECK: [!] callgraph is incomplete for ptr @A - analyzing function +; CHECK-NEXT: found inline assembly +; CHECK-NOT: indirect call found + +@addrthief = global [2 x ptr] [ptr @HelperA, ptr @HelperB] + +define internal void @HelperA() { + ret void +} + +define internal void @HelperB() { + ret void +} + +define amdgpu_kernel void @A() { + call void asm sideeffect "v_mov_b32 v0, 7", "~{v0}"() + call void @HelperA() + ret void +} + +define amdgpu_kernel void @B(ptr %out) { + call void @HelperB() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll b/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll new file mode 100644 index 00000000000000..13c30c9e45e808 --- /dev/null +++ b/llvm/test/tools/llvm-split/AMDGPU/indirect-call-inline-asm.ll @@ -0,0 +1,30 @@ +; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-no-externalize-address-taken +; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s +; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s + +; CHECK0: define internal void @HelperB +; CHECK0: define amdgpu_kernel void @B + +; CHECK1: define internal void @HelperA() +; CHECK1: define amdgpu_kernel void @A() + +@addrthief = global [2 x ptr] [ptr @HelperA, ptr @HelperB] + +define internal void @HelperA() { + ret void +} + +define internal void @HelperB() { + ret void +} + +define 
amdgpu_kernel void @A() { + call void asm sideeffect "v_mov_b32 v0, 7", "~{v0}"() + call void @HelperA() + ret void +} + +define amdgpu_kernel void @B(ptr %out) { + call void @HelperB() + ret void +} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll deleted file mode 100644 index d7e84abd5f968d..00000000000000 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s -; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s - -; 3 kernels: -; - A calls nothing -; - B calls @PerryThePlatypus -; - C calls @Perry, an alias of @PerryThePlatypus -; -; We should see through the alias and put B/C in the same -; partition. -; -; Additionally, @PerryThePlatypus gets externalized as -; the alias counts as taking its address. 
- -; CHECK0: define amdgpu_kernel void @A - -; CHECK1: @Perry = internal alias ptr (), ptr @PerryThePlatypus -; CHECK1: define hidden void @PerryThePlatypus() -; CHECK1: define amdgpu_kernel void @B -; CHECK1: define amdgpu_kernel void @C - -@Perry = internal alias ptr(), ptr @PerryThePlatypus - -define internal void @PerryThePlatypus() { - ret void -} - -define amdgpu_kernel void @A() { - ret void -} - -define amdgpu_kernel void @B() { - call void @PerryThePlatypus() - ret void -} - -define amdgpu_kernel void @C() { - call void @Perry() - ret void -} diff --git a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll index 5be945bda48bf4..c2acb06d3e72e5 100644 --- a/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll +++ b/llvm/test/tools/llvm-split/AMDGPU/kernels-dependency-indirect.ll @@ -3,18 +3,6 @@ ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s ; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s -; We have 4 kernels: -; - Each kernel has an internal helper -; - @A and @B's helpers does an indirect call. -; -; We default to putting A/B in P0, alongside a copy -; of all helpers who have their address taken. -; The other kernels can still go into separate partitions. -; -; Note that dependency discovery shouldn't stop upon finding an -; indirect call. HelperC/D should also end up in P0 as they -; are dependencies of HelperB. - ; CHECK0: define internal void @HelperD ; CHECK0: define amdgpu_kernel void @D From 81bd712f928b3c736d83252df75c1c1bd3374122 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 10 Oct 2024 23:43:21 -0700 Subject: [PATCH 132/177] [ELF] Revert Ctx & parameters from SyntheticSection Since Ctx &ctx is a member variable, 1f391a75af8685e6bba89421443d72ac6a186599 7a5b9ef54eb96abd8415fd893576c42e51fd95db e2f0ec3a3a8a2981be8a1aac2004cfb9064c61e8 can be reverted. 
--- lld/ELF/AArch64ErrataFix.cpp | 8 +- lld/ELF/ARMErrataFix.cpp | 8 +- lld/ELF/Arch/ARM.cpp | 13 +- lld/ELF/Arch/RISCV.cpp | 8 +- lld/ELF/InputSection.cpp | 2 +- lld/ELF/InputSection.h | 8 +- lld/ELF/LinkerScript.cpp | 2 +- lld/ELF/OutputSections.cpp | 4 +- lld/ELF/Relocations.cpp | 9 +- lld/ELF/SyntheticSections.cpp | 229 ++++++++++++++-------------- lld/ELF/SyntheticSections.h | 275 +++++++++++++++++----------------- lld/ELF/Writer.cpp | 18 +-- 12 files changed, 283 insertions(+), 301 deletions(-) diff --git a/lld/ELF/AArch64ErrataFix.cpp b/lld/ELF/AArch64ErrataFix.cpp index a5129c58da13d9..b1f6c424688b36 100644 --- a/lld/ELF/AArch64ErrataFix.cpp +++ b/lld/ELF/AArch64ErrataFix.cpp @@ -374,9 +374,9 @@ class elf::Patch843419Section final : public SyntheticSection { public: Patch843419Section(Ctx &, InputSection *p, uint64_t off); - void writeTo(Ctx &, uint8_t *buf) override; + void writeTo(uint8_t *buf) override; - size_t getSize(Ctx &) const override { return 8; } + size_t getSize() const override { return 8; } uint64_t getLDSTAddr() const; @@ -399,7 +399,7 @@ Patch843419Section::Patch843419Section(Ctx &ctx, InputSection *p, uint64_t off) this->parent = p->getParent(); patchSym = addSyntheticLocal( saver().save("__CortexA53843419_" + utohexstr(getLDSTAddr())), STT_FUNC, - 0, getSize(ctx), *this); + 0, getSize(), *this); addSyntheticLocal(saver().save("$x"), STT_NOTYPE, 0, 0, *this); } @@ -407,7 +407,7 @@ uint64_t Patch843419Section::getLDSTAddr() const { return patchee->getVA(patcheeOffset); } -void Patch843419Section::writeTo(Ctx &ctx, uint8_t *buf) { +void Patch843419Section::writeTo(uint8_t *buf) { // Copy the instruction that we will be replacing with a branch in the // patchee Section. 
write32le(buf, read32le(patchee->content().begin() + patcheeOffset)); diff --git a/lld/ELF/ARMErrataFix.cpp b/lld/ELF/ARMErrataFix.cpp index 57df542e57ec48..6dc6432c40ea5c 100644 --- a/lld/ELF/ARMErrataFix.cpp +++ b/lld/ELF/ARMErrataFix.cpp @@ -73,9 +73,9 @@ class elf::Patch657417Section final : public SyntheticSection { Patch657417Section(Ctx &, InputSection *p, uint64_t off, uint32_t instr, bool isARM); - void writeTo(Ctx &, uint8_t *buf) override; + void writeTo(uint8_t *buf) override; - size_t getSize(Ctx &) const override { return 4; } + size_t getSize() const override { return 4; } // Get the virtual address of the branch instruction at patcheeOffset. uint64_t getBranchAddr() const; @@ -142,7 +142,7 @@ Patch657417Section::Patch657417Section(Ctx &ctx, InputSection *p, uint64_t off, parent = p->getParent(); patchSym = addSyntheticLocal( saver().save("__CortexA8657417_" + utohexstr(getBranchAddr())), STT_FUNC, - isARM ? 0 : 1, getSize(ctx), *this); + isARM ? 0 : 1, getSize(), *this); addSyntheticLocal(saver().save(isARM ? "$a" : "$t"), STT_NOTYPE, 0, 0, *this); } @@ -176,7 +176,7 @@ static uint64_t getThumbDestAddr(Ctx &ctx, uint64_t sourceAddr, return sourceAddr + offset + 4; } -void Patch657417Section::writeTo(Ctx &ctx, uint8_t *buf) { +void Patch657417Section::writeTo(uint8_t *buf) { // The base instruction of the patch is always a 32-bit unconditional branch. if (isARM) write32le(buf, 0xea000000); diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index ecf293602ac99d..d964994a4b3cff 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -1333,8 +1333,7 @@ class elf::ArmCmseSGVeneer { ArmCmseSGSection::ArmCmseSGSection(Ctx &ctx) : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC | llvm::ELF::SHF_EXECINSTR, llvm::ELF::SHT_PROGBITS, - /*alignment=*/32, ".gnu.sgstubs"), - ctx(ctx) { + /*alignment=*/32, ".gnu.sgstubs") { entsize = ACLESESYM_SIZE; // The range of addresses used in the CMSE import library should be fixed. 
for (auto &[_, sym] : ctx.symtab->cmseImportLib) { @@ -1384,7 +1383,7 @@ void ArmCmseSGSection::addSGVeneer(Symbol *acleSeSym, Symbol *sym) { sgVeneers.emplace_back(ss); } -void ArmCmseSGSection::writeTo(Ctx &ctx, uint8_t *buf) { +void ArmCmseSGSection::writeTo(uint8_t *buf) { for (ArmCmseSGVeneer *s : sgVeneers) { uint8_t *p = buf + s->offset; write16(p + 0, 0xe97f); // SG @@ -1401,14 +1400,14 @@ void ArmCmseSGSection::addMappingSymbol() { addSyntheticLocal("$t", STT_NOTYPE, /*off=*/0, /*size=*/0, *this); } -size_t ArmCmseSGSection::getSize(Ctx &) const { +size_t ArmCmseSGSection::getSize() const { if (sgVeneers.empty()) return (impLibMaxAddr ? impLibMaxAddr - getVA() : 0) + newEntries * entsize; return entries.size() * entsize; } -void ArmCmseSGSection::finalizeContents(Ctx &) { +void ArmCmseSGSection::finalizeContents() { if (sgVeneers.empty()) return; @@ -1476,8 +1475,8 @@ template void elf::writeARMCmseImportLib(Ctx &ctx) { osec->recordSection(isec); osec->finalizeInputSections(ctx); osec->shName = shstrtab->addString(osec->name); - osec->size = isec->getSize(ctx); - isec->finalizeContents(ctx); + osec->size = isec->getSize(); + isec->finalizeContents(); osec->offset = alignToPowerOf2(off, osec->addralign); off = osec->offset + osec->size; } diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 351cca025b3864..d65467f10378be 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -1048,8 +1048,8 @@ class RISCVAttributesSection final : public SyntheticSection { : SyntheticSection(ctx, 0, SHT_RISCV_ATTRIBUTES, 1, ".riscv.attributes") { } - size_t getSize(Ctx &) const override { return size; } - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override { return size; } + void writeTo(uint8_t *buf) override; static constexpr StringRef vendor = "riscv"; DenseMap intAttr; @@ -1278,8 +1278,8 @@ mergeAttributesSection(Ctx &ctx, return &merged; } -void RISCVAttributesSection::writeTo(Ctx &ctx, uint8_t *buf) { - const size_t 
size = getSize(ctx); +void RISCVAttributesSection::writeTo(uint8_t *buf) { + const size_t size = getSize(); uint8_t *const end = buf + size; *buf = ELFAttrs::Format_Version; write32(buf + 1, size - 1); diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 90716f4f3675cc..082fdb9f5c9ac4 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -101,7 +101,7 @@ InputSectionBase::InputSectionBase(ObjFile &file, size_t InputSectionBase::getSize() const { if (auto *s = dyn_cast(this)) - return s->getSize(ctx); + return s->getSize(); return size - bytesDropped; } diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index bf2cf09f2921b2..2b34047bc0682a 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -480,13 +480,13 @@ class SyntheticSection : public InputSection { ctx(ctx) {} virtual ~SyntheticSection() = default; - virtual size_t getSize(Ctx &) const = 0; + virtual size_t getSize() const = 0; virtual bool updateAllocSize(Ctx &) { return false; } // If the section has the SHF_ALLOC flag and the size may be changed if // thunks are added, update the section size. 
- virtual bool isNeeded(Ctx &) const { return true; } - virtual void finalizeContents(Ctx &) {} - virtual void writeTo(Ctx &, uint8_t *buf) = 0; + virtual bool isNeeded() const { return true; } + virtual void finalizeContents() {} + virtual void writeTo(uint8_t *buf) = 0; static bool classof(const SectionBase *sec) { return sec->kind() == InputSectionBase::Synthetic; diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index e9a637bac4e9bd..b736cb1beef37e 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -1058,7 +1058,7 @@ void LinkerScript::diagnoseOrphanHandling() const { } void LinkerScript::diagnoseMissingSGSectionAddress() const { - if (!ctx.arg.cmseImplib || !ctx.in.armCmseSGSection->isNeeded(ctx)) + if (!ctx.arg.cmseImplib || !ctx.in.armCmseSGSection->isNeeded()) return; OutputSection *sec = findByName(sectionCommands, ".gnu.sgstubs"); diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp index 3f3b80830d80d5..7a65858a6f8c18 100644 --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -248,7 +248,7 @@ void OutputSection::finalizeInputSections(Ctx &ctx) { commitSection(ctx, s); } for (auto *ms : mergeSections) - ms->finalizeContents(ctx); + ms->finalizeContents(); } static void sortByOrder(MutableArrayRef in, @@ -525,7 +525,7 @@ void OutputSection::writeTo(Ctx &ctx, uint8_t *buf, parallel::TaskGroup &tg) { for (size_t i = begin; i != end; ++i) { InputSection *isec = sections[i]; if (auto *s = dyn_cast(isec)) - s->writeTo(ctx, buf + isec->outSecOff); + s->writeTo(buf + isec->outSecOff); else isec->writeTo(buf + isec->outSecOff); diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 5d81d0cccb78e5..cb33f35e59e43b 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -2020,15 +2020,14 @@ static void forEachInputSectionDescription( // This may invalidate any output section offsets stored outside of InputSection void ThunkCreator::mergeThunks(ArrayRef outputSections) { 
forEachInputSectionDescription( - outputSections, - [&, &ctx = ctx](OutputSection *os, InputSectionDescription *isd) { + outputSections, [&](OutputSection *os, InputSectionDescription *isd) { if (isd->thunkSections.empty()) return; // Remove any zero sized precreated Thunks. llvm::erase_if(isd->thunkSections, - [&ctx](const std::pair &ts) { - return ts.first->getSize(ctx) == 0; + [](const std::pair &ts) { + return ts.first->getSize() == 0; }); // ISD->ThunkSections contains all created ThunkSections, including @@ -2081,7 +2080,7 @@ ThunkSection *ThunkCreator::getISDThunkSec(OutputSection *os, for (std::pair tp : isd->thunkSections) { ThunkSection *ts = tp.first; uint64_t tsBase = os->addr + ts->outSecOff - pcBias; - uint64_t tsLimit = tsBase + ts->getSize(ctx); + uint64_t tsLimit = tsBase + ts->getSize(); if (ctx.target->inBranchRange(rel.type, src, (src > tsLimit) ? tsBase : tsLimit)) return ts; diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 5d62f089e40848..ee0e9c513740ac 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -103,8 +103,7 @@ MipsAbiFlagsSection::MipsAbiFlagsSection(Ctx &ctx, this->entsize = sizeof(Elf_Mips_ABIFlags); } -template -void MipsAbiFlagsSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void MipsAbiFlagsSection::writeTo(uint8_t *buf) { memcpy(buf, &flags, sizeof(flags)); } @@ -165,11 +164,10 @@ MipsOptionsSection::MipsOptionsSection(Ctx &ctx, Elf_Mips_RegInfo reginfo) this->entsize = sizeof(Elf_Mips_Options) + sizeof(Elf_Mips_RegInfo); } -template -void MipsOptionsSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void MipsOptionsSection::writeTo(uint8_t *buf) { auto *options = reinterpret_cast(buf); options->kind = ODK_REGINFO; - options->size = getSize(ctx); + options->size = getSize(); if (!ctx.arg.relocatable) reginfo.ri_gp_value = ctx.in.mipsGot->getGp(); @@ -228,8 +226,7 @@ MipsReginfoSection::MipsReginfoSection(Ctx &ctx, Elf_Mips_RegInfo reginfo) this->entsize = 
sizeof(Elf_Mips_RegInfo); } -template -void MipsReginfoSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void MipsReginfoSection::writeTo(uint8_t *buf) { if (!ctx.arg.relocatable) reginfo.ri_gp_value = ctx.in.mipsGot->getGp(); memcpy(buf, ®info, sizeof(reginfo)); @@ -324,9 +321,9 @@ GnuPropertySection::GnuPropertySection(Ctx &ctx) : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, ctx.arg.wordsize, ".note.gnu.property") {} -void GnuPropertySection::writeTo(Ctx &ctx, uint8_t *buf) { +void GnuPropertySection::writeTo(uint8_t *buf) { write32(buf, 4); // Name size - write32(buf + 4, getSize(ctx) - 16); // Content size + write32(buf + 4, getSize() - 16); // Content size write32(buf + 8, NT_GNU_PROPERTY_TYPE_0); // Type memcpy(buf + 12, "GNU", 4); // Name string @@ -352,7 +349,7 @@ void GnuPropertySection::writeTo(Ctx &ctx, uint8_t *buf) { } } -size_t GnuPropertySection::getSize(Ctx &ctx) const { +size_t GnuPropertySection::getSize() const { uint32_t contentSize = 0; if (ctx.arg.andFeatures != 0) contentSize += ctx.arg.is64 ? 16 : 12; @@ -366,7 +363,7 @@ BuildIdSection::BuildIdSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_NOTE, 4, ".note.gnu.build-id"), hashSize(getHashSize()) {} -void BuildIdSection::writeTo(Ctx &ctx, uint8_t *buf) { +void BuildIdSection::writeTo(uint8_t *buf) { write32(buf, 4); // Name size write32(buf + 4, hashSize); // Content size write32(buf + 8, NT_GNU_BUILD_ID); // Type @@ -514,7 +511,7 @@ static void writeCieFde(uint8_t *buf, ArrayRef d) { write32(buf, d.size() - 4); } -void EhFrameSection::finalizeContents(Ctx &) { +void EhFrameSection::finalizeContents() { assert(!this->size); // Not finalized. switch (ctx.arg.ekind) { @@ -630,7 +627,7 @@ uint64_t EhFrameSection::getFdePc(uint8_t *buf, size_t fdeOff, fatal("unknown FDE size relative encoding"); } -void EhFrameSection::writeTo(Ctx &ctx, uint8_t *buf) { +void EhFrameSection::writeTo(uint8_t *buf) { // Write CIE and FDE records. 
for (CieRecord *rec : cieRecords) { size_t cieOffset = rec->cie->outputOff; @@ -709,7 +706,7 @@ uint64_t GotSection::getGlobalDynOffset(const Symbol &b) const { return b.getTlsGdIdx(ctx) * ctx.arg.wordsize; } -void GotSection::finalizeContents(Ctx &) { +void GotSection::finalizeContents() { if (ctx.arg.emachine == EM_PPC64 && numEntries <= ctx.target->gotHeaderEntriesNum && !ctx.sym.globalOffsetTable) @@ -718,13 +715,13 @@ void GotSection::finalizeContents(Ctx &) { size = numEntries * ctx.arg.wordsize; } -bool GotSection::isNeeded(Ctx &ctx) const { +bool GotSection::isNeeded() const { // Needed if the GOT symbol is used or the number of entries is more than just // the header. A GOT with just the header may not be needed. return hasGotOffRel || numEntries > ctx.target->gotHeaderEntriesNum; } -void GotSection::writeTo(Ctx &ctx, uint8_t *buf) { +void GotSection::writeTo(uint8_t *buf) { // On PPC64 .got may be needed but empty. Skip the write. if (size == 0) return; @@ -878,7 +875,7 @@ bool MipsGotSection::tryMergeGots(FileGot &dst, FileGot &src, bool isPrimary) { return true; } -void MipsGotSection::finalizeContents(Ctx &) { updateAllocSize(ctx); } +void MipsGotSection::finalizeContents() { updateAllocSize(ctx); } bool MipsGotSection::updateAllocSize(Ctx &ctx) { size = headerEntriesNum * ctx.arg.wordsize; @@ -1090,7 +1087,7 @@ void MipsGotSection::build() { } } -bool MipsGotSection::isNeeded(Ctx &ctx) const { +bool MipsGotSection::isNeeded() const { // We add the .got section to the result for dynamic MIPS target because // its address and properties are mentioned in the .dynamic section. return !ctx.arg.relocatable; @@ -1105,7 +1102,7 @@ uint64_t MipsGotSection::getGp(const InputFile *f) const { return getVA() + gots[f->mipsGotIndex].startIndex * ctx.arg.wordsize + 0x7ff0; } -void MipsGotSection::writeTo(Ctx &ctx, uint8_t *buf) { +void MipsGotSection::writeTo(uint8_t *buf) { // Set the MSB of the second GOT slot. 
This is not required by any // MIPS ABI documentation, though. // @@ -1189,12 +1186,12 @@ void GotPltSection::addEntry(Symbol &sym) { entries.push_back(&sym); } -size_t GotPltSection::getSize(Ctx &ctx) const { +size_t GotPltSection::getSize() const { return (ctx.target->gotPltHeaderEntriesNum + entries.size()) * ctx.target->gotEntrySize; } -void GotPltSection::writeTo(Ctx &ctx, uint8_t *buf) { +void GotPltSection::writeTo(uint8_t *buf) { ctx.target->writeGotPltHeader(buf); buf += ctx.target->gotPltHeaderEntriesNum * ctx.target->gotEntrySize; for (const Symbol *b : entries) { @@ -1203,7 +1200,7 @@ void GotPltSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -bool GotPltSection::isNeeded(Ctx &) const { +bool GotPltSection::isNeeded() const { // We need to emit GOTPLT even if it's empty if there's a relocation relative // to it. return !entries.empty() || hasGotPltOffRel; @@ -1234,11 +1231,11 @@ void IgotPltSection::addEntry(Symbol &sym) { entries.push_back(&sym); } -size_t IgotPltSection::getSize(Ctx &ctx) const { +size_t IgotPltSection::getSize() const { return entries.size() * ctx.target->gotEntrySize; } -void IgotPltSection::writeTo(Ctx &ctx, uint8_t *buf) { +void IgotPltSection::writeTo(uint8_t *buf) { for (const Symbol *b : entries) { ctx.target->writeIgotPlt(buf, *b); buf += ctx.target->gotEntrySize; @@ -1273,7 +1270,7 @@ unsigned StringTableSection::addString(StringRef s, bool hashIt) { return ret; } -void StringTableSection::writeTo(Ctx &ctx, uint8_t *buf) { +void StringTableSection::writeTo(uint8_t *buf) { for (StringRef s : strings) { memcpy(buf, s.data(), s.size()); buf[s.size()] = '\0'; @@ -1308,9 +1305,9 @@ DynamicSection::DynamicSection(Ctx &ctx) // // DT_RELASZ is the total size of the included sections. 
static uint64_t addRelaSz(const RelocationBaseSection &relaDyn) { - size_t size = relaDyn.getSize(ctx); + size_t size = relaDyn.getSize(); if (ctx.in.relaPlt->getParent() == relaDyn.getParent()) - size += ctx.in.relaPlt->getSize(ctx); + size += ctx.in.relaPlt->getSize(); return size; } @@ -1318,7 +1315,7 @@ static uint64_t addRelaSz(const RelocationBaseSection &relaDyn) { // output section. When this occurs we cannot just use the OutputSection // Size. Moreover the [DT_JMPREL, DT_JMPREL + DT_PLTRELSZ) is permitted to // overlap with the [DT_RELA, DT_RELA + DT_RELASZ). -static uint64_t addPltRelSz() { return ctx.in.relaPlt->getSize(ctx); } +static uint64_t addPltRelSz() { return ctx.in.relaPlt->getSize(); } // Add remaining entries to complete .dynamic contents. template @@ -1405,7 +1402,7 @@ DynamicSection::computeContents() { if (!ctx.arg.shared && !ctx.arg.relocatable && !ctx.arg.zRodynamic) addInt(DT_DEBUG, 0); - if (part.relaDyn->isNeeded(ctx)) { + if (part.relaDyn->isNeeded()) { addInSec(part.relaDyn->dynamicTag, *part.relaDyn); entries.emplace_back(part.relaDyn->sizeDynamicTag, addRelaSz(*part.relaDyn)); @@ -1438,7 +1435,7 @@ DynamicSection::computeContents() { addInt(DT_AARCH64_AUTH_RELRSZ, part.relrAuthDyn->getParent()->size); addInt(DT_AARCH64_AUTH_RELRENT, sizeof(Elf_Relr)); } - if (isMain && ctx.in.relaPlt->isNeeded(ctx)) { + if (isMain && ctx.in.relaPlt->isNeeded()) { addInSec(DT_JMPREL, *ctx.in.relaPlt); entries.emplace_back(DT_PLTRELSZ, addPltRelSz()); switch (ctx.arg.emachine) { @@ -1485,11 +1482,11 @@ DynamicSection::computeContents() { addInt(DT_AARCH64_MEMTAG_MODE, ctx.arg.androidMemtagMode == NT_MEMTAG_LEVEL_ASYNC); addInt(DT_AARCH64_MEMTAG_HEAP, ctx.arg.androidMemtagHeap); addInt(DT_AARCH64_MEMTAG_STACK, ctx.arg.androidMemtagStack); - if (ctx.mainPart->memtagGlobalDescriptors->isNeeded(ctx)) { + if (ctx.mainPart->memtagGlobalDescriptors->isNeeded()) { addInSec(DT_AARCH64_MEMTAG_GLOBALS, *ctx.mainPart->memtagGlobalDescriptors); 
addInt(DT_AARCH64_MEMTAG_GLOBALSSZ, - ctx.mainPart->memtagGlobalDescriptors->getSize(ctx)); + ctx.mainPart->memtagGlobalDescriptors->getSize()); } } } @@ -1497,7 +1494,7 @@ DynamicSection::computeContents() { addInSec(DT_SYMTAB, *part.dynSymTab); addInt(DT_SYMENT, sizeof(Elf_Sym)); addInSec(DT_STRTAB, *part.dynStrTab); - addInt(DT_STRSZ, part.dynStrTab->getSize(ctx)); + addInt(DT_STRSZ, part.dynStrTab->getSize()); if (!ctx.arg.zText) addInt(DT_TEXTREL, 0); if (part.gnuHashTab && part.gnuHashTab->getParent()) @@ -1527,13 +1524,13 @@ DynamicSection::computeContents() { addInt(DT_FINI, b->getVA()); } - if (part.verSym && part.verSym->isNeeded(ctx)) + if (part.verSym && part.verSym->isNeeded()) addInSec(DT_VERSYM, *part.verSym); if (part.verDef && part.verDef->isLive()) { addInSec(DT_VERDEF, *part.verDef); addInt(DT_VERDEFNUM, getVerDefNum()); } - if (part.verNeed && part.verNeed->isNeeded(ctx)) { + if (part.verNeed && part.verNeed->isNeeded()) { addInSec(DT_VERNEED, *part.verNeed); unsigned needNum = 0; for (SharedFile *f : ctx.sharedFiles) @@ -1570,7 +1567,7 @@ DynamicSection::computeContents() { addInSec(DT_PPC_GOT, *ctx.in.got); // Glink dynamic tag is required by the V2 abi if the plt section isn't empty. - if (ctx.arg.emachine == EM_PPC64 && ctx.in.plt->isNeeded(ctx)) { + if (ctx.arg.emachine == EM_PPC64 && ctx.in.plt->isNeeded()) { // The Glink tag points to 32 bytes before the first lazy symbol resolution // stub, which starts directly after the header. 
addInt(DT_PPC64_GLINK, @@ -1584,14 +1581,13 @@ DynamicSection::computeContents() { return entries; } -template void DynamicSection::finalizeContents(Ctx &) { +template void DynamicSection::finalizeContents() { if (OutputSection *sec = getPartition().dynStrTab->getParent()) getParent()->link = sec->sectionIndex; this->size = computeContents().size() * this->entsize; } -template -void DynamicSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void DynamicSection::writeTo(uint8_t *buf) { auto *p = reinterpret_cast(buf); for (std::pair kv : computeContents()) { @@ -1686,7 +1682,7 @@ void RelocationBaseSection::partitionRels() { relocs.begin(); } -void RelocationBaseSection::finalizeContents(Ctx &) { +void RelocationBaseSection::finalizeContents() { SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); // When linking glibc statically, .rel{,a}.plt contains R_*_IRELATIVE @@ -1743,8 +1739,7 @@ RelocationSection::RelocationSection(Ctx &ctx, StringRef name, this->entsize = ctx.arg.isRela ? sizeof(Elf_Rela) : sizeof(Elf_Rel); } -template -void RelocationSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void RelocationSection::writeTo(uint8_t *buf) { computeRels(); for (const DynamicReloc &rel : relocs) { auto *p = reinterpret_cast(buf); @@ -2142,7 +2137,7 @@ static bool sortMipsSymbols(const SymbolTableEntry &l, return !l.sym->isInGot(ctx); } -void SymbolTableBaseSection::finalizeContents(Ctx &) { +void SymbolTableBaseSection::finalizeContents() { if (OutputSection *sec = strTabSec.getParent()) getParent()->link = sec->sectionIndex; @@ -2259,8 +2254,7 @@ static uint32_t getSymSectionIndex(Symbol *sym) { } // Write the internal symbol table contents to the output symbol table. -template -void SymbolTableSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void SymbolTableSection::writeTo(uint8_t *buf) { // The first entry is a null entry as per the ELF spec. 
buf += sizeof(Elf_Sym); @@ -2342,7 +2336,7 @@ SymtabShndxSection::SymtabShndxSection(Ctx &ctx) this->entsize = 4; } -void SymtabShndxSection::writeTo(Ctx &ctx, uint8_t *buf) { +void SymtabShndxSection::writeTo(uint8_t *buf) { // We write an array of 32 bit values, where each value has 1:1 association // with an entry in ctx.in.symTab if the corresponding entry contains // SHN_XINDEX, we need to write actual index, otherwise, we must write @@ -2355,7 +2349,7 @@ void SymtabShndxSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -bool SymtabShndxSection::isNeeded(Ctx &ctx) const { +bool SymtabShndxSection::isNeeded() const { // SHT_SYMTAB can hold symbols with section indices values up to // SHN_LORESERVE. If we need more, we want to use extension SHT_SYMTAB_SHNDX // section. Problem is that we reveal the final section indices a bit too @@ -2368,11 +2362,11 @@ bool SymtabShndxSection::isNeeded(Ctx &ctx) const { return size >= SHN_LORESERVE; } -void SymtabShndxSection::finalizeContents(Ctx &) { +void SymtabShndxSection::finalizeContents() { getParent()->link = ctx.in.symTab->getParent()->sectionIndex; } -size_t SymtabShndxSection::getSize(Ctx &ctx) const { +size_t SymtabShndxSection::getSize() const { return ctx.in.symTab->getNumSymbols() * 4; } @@ -2410,7 +2404,7 @@ GnuHashTableSection::GnuHashTableSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_GNU_HASH, ctx.arg.wordsize, ".gnu.hash") {} -void GnuHashTableSection::finalizeContents(Ctx &) { +void GnuHashTableSection::finalizeContents() { if (OutputSection *sec = getPartition().dynSymTab->getParent()) getParent()->link = sec->sectionIndex; @@ -2429,7 +2423,7 @@ void GnuHashTableSection::finalizeContents(Ctx &) { size += symbols.size() * 4; // Hash values } -void GnuHashTableSection::writeTo(Ctx &ctx, uint8_t *buf) { +void GnuHashTableSection::writeTo(uint8_t *buf) { // Write a header. 
write32(buf, nBuckets); write32(buf + 4, getPartition().dynSymTab->getNumSymbols() - symbols.size()); @@ -2520,7 +2514,7 @@ HashTableSection::HashTableSection(Ctx &ctx) this->entsize = 4; } -void HashTableSection::finalizeContents(Ctx &) { +void HashTableSection::finalizeContents() { SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); if (OutputSection *sec = symTab->getParent()) @@ -2534,7 +2528,7 @@ void HashTableSection::finalizeContents(Ctx &) { this->size = numEntries * 4; } -void HashTableSection::writeTo(Ctx &ctx, uint8_t *buf) { +void HashTableSection::writeTo(uint8_t *buf) { SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); unsigned numSymbols = symTab->getNumSymbols(); @@ -2577,7 +2571,7 @@ PltSection::PltSection(Ctx &ctx) this->flags |= SHF_WRITE; } -void PltSection::writeTo(Ctx &ctx, uint8_t *buf) { +void PltSection::writeTo(uint8_t *buf) { // At beginning of PLT, we have code to call the dynamic // linker to resolve dynsyms at runtime. Write such code. ctx.target->writePltHeader(buf); @@ -2595,14 +2589,13 @@ void PltSection::addEntry(Symbol &sym) { entries.push_back(&sym); } -size_t PltSection::getSize(Ctx &ctx) const { +size_t PltSection::getSize() const { return headerSize + entries.size() * ctx.target->pltEntrySize; } -bool PltSection::isNeeded(Ctx &ctx) const { +bool PltSection::isNeeded() const { // For -z retpolineplt, .iplt needs the .plt header. 
- return !entries.empty() || - (ctx.arg.zRetpolineplt && ctx.in.iplt->isNeeded(ctx)); + return !entries.empty() || (ctx.arg.zRetpolineplt && ctx.in.iplt->isNeeded()); } // Used by ARM to add mapping symbols in the PLT section, which aid @@ -2626,7 +2619,7 @@ IpltSection::IpltSection(Ctx &ctx) } } -void IpltSection::writeTo(Ctx &ctx, uint8_t *buf) { +void IpltSection::writeTo(uint8_t *buf) { uint32_t off = 0; for (const Symbol *sym : entries) { ctx.target->writeIplt(buf + off, *sym, getVA() + off); @@ -2634,7 +2627,7 @@ void IpltSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -size_t IpltSection::getSize(Ctx &ctx) const { +size_t IpltSection::getSize() const { return entries.size() * ctx.target->ipltEntrySize; } @@ -2658,11 +2651,11 @@ PPC32GlinkSection::PPC32GlinkSection(Ctx &ctx) : PltSection(ctx) { addralign = 4; } -void PPC32GlinkSection::writeTo(Ctx &ctx, uint8_t *buf) { +void PPC32GlinkSection::writeTo(uint8_t *buf) { writePPC32GlinkSection(ctx, buf, entries.size()); } -size_t PPC32GlinkSection::getSize(Ctx &ctx) const { +size_t PPC32GlinkSection::getSize() const { return headerSize + entries.size() * ctx.target->pltEntrySize + footerSize; } @@ -2728,18 +2721,16 @@ IBTPltSection::IBTPltSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC | SHF_EXECINSTR, SHT_PROGBITS, 16, ".plt") {} -void IBTPltSection::writeTo(Ctx &ctx, uint8_t *buf) { +void IBTPltSection::writeTo(uint8_t *buf) { ctx.target->writeIBTPlt(buf, ctx.in.plt->getNumEntries()); } -size_t IBTPltSection::getSize(Ctx &ctx) const { +size_t IBTPltSection::getSize() const { // 16 is the header size of .plt. 
return 16 + ctx.in.plt->getNumEntries() * ctx.target->pltEntrySize; } -bool IBTPltSection::isNeeded(Ctx &ctx) const { - return ctx.in.plt->getNumEntries() > 0; -} +bool IBTPltSection::isNeeded() const { return ctx.in.plt->getNumEntries() > 0; } RelroPaddingSection::RelroPaddingSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_NOBITS, 1, @@ -3242,7 +3233,7 @@ void DebugNamesSection::getNameRelocs( } } -template void DebugNamesSection::finalizeContents(Ctx &) { +template void DebugNamesSection::finalizeContents() { // Get relocations of .debug_names sections. auto relocs = std::make_unique[]>(numChunks); parallelFor(0, numChunks, [&](size_t i) { @@ -3262,8 +3253,7 @@ template void DebugNamesSection::finalizeContents(Ctx &) { }); } -template -void DebugNamesSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void DebugNamesSection::writeTo(uint8_t *buf) { [[maybe_unused]] const uint8_t *const beginBuf = buf; // Write the header. endian::writeNext(buf, hdr.UnitLength); @@ -3574,7 +3564,7 @@ std::unique_ptr GdbIndexSection::create(Ctx &ctx) { return ret; } -void GdbIndexSection::writeTo(Ctx &ctx, uint8_t *buf) { +void GdbIndexSection::writeTo(uint8_t *buf) { // Write the header. auto *hdr = reinterpret_cast(buf); uint8_t *start = buf; @@ -3643,12 +3633,12 @@ void GdbIndexSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -bool GdbIndexSection::isNeeded(Ctx &) const { return !chunks.empty(); } +bool GdbIndexSection::isNeeded() const { return !chunks.empty(); } EhFrameHeader::EhFrameHeader(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_PROGBITS, 4, ".eh_frame_hdr") {} -void EhFrameHeader::writeTo(Ctx &ctx, uint8_t *buf) { +void EhFrameHeader::writeTo(uint8_t *buf) { // Unlike most sections, the EhFrameHeader section is written while writing // another section, namely EhFrameSection, which calls the write() function // below from its writeTo() function. 
This is necessary because the contents @@ -3681,13 +3671,13 @@ void EhFrameHeader::write() { } } -size_t EhFrameHeader::getSize(Ctx &ctx) const { +size_t EhFrameHeader::getSize() const { // .eh_frame_hdr has a 12 bytes header followed by an array of FDEs. return 12 + getPartition().ehFrame->numFdes * 8; } -bool EhFrameHeader::isNeeded(Ctx &ctx) const { - return isLive() && getPartition().ehFrame->isNeeded(ctx); +bool EhFrameHeader::isNeeded() const { + return isLive() && getPartition().ehFrame->isNeeded(); } VersionDefinitionSection::VersionDefinitionSection(Ctx &ctx) @@ -3702,7 +3692,7 @@ StringRef VersionDefinitionSection::getFileDefName() { return ctx.arg.outputFile; } -void VersionDefinitionSection::finalizeContents(Ctx &) { +void VersionDefinitionSection::finalizeContents() { fileDefNameOff = getPartition().dynStrTab->addString(getFileDefName()); for (const VersionDefinition &v : namedVersionDefs(ctx)) verDefNameOffs.push_back(getPartition().dynStrTab->addString(v.name)); @@ -3734,7 +3724,7 @@ void VersionDefinitionSection::writeOne(uint8_t *buf, uint32_t index, write32(buf + 24, 0); // vda_next } -void VersionDefinitionSection::writeTo(Ctx &ctx, uint8_t *buf) { +void VersionDefinitionSection::writeTo(uint8_t *buf) { writeOne(buf, 1, getFileDefName(), fileDefNameOff); auto nameOffIt = verDefNameOffs.begin(); @@ -3747,7 +3737,7 @@ void VersionDefinitionSection::writeTo(Ctx &ctx, uint8_t *buf) { write32(buf + 16, 0); // vd_next } -size_t VersionDefinitionSection::getSize(Ctx &ctx) const { +size_t VersionDefinitionSection::getSize() const { return EntrySize * getVerDefNum(); } @@ -3758,17 +3748,17 @@ VersionTableSection::VersionTableSection(Ctx &ctx) this->entsize = 2; } -void VersionTableSection::finalizeContents(Ctx &) { +void VersionTableSection::finalizeContents() { // At the moment of june 2016 GNU docs does not mention that sh_link field // should be set, but Sun docs do. Also readelf relies on this field. 
getParent()->link = getPartition().dynSymTab->getParent()->sectionIndex; } -size_t VersionTableSection::getSize(Ctx &ctx) const { +size_t VersionTableSection::getSize() const { return (getPartition().dynSymTab->getSymbols().size() + 1) * 2; } -void VersionTableSection::writeTo(Ctx &ctx, uint8_t *buf) { +void VersionTableSection::writeTo(uint8_t *buf) { buf += 2; for (const SymbolTableEntry &s : getPartition().dynSymTab->getSymbols()) { // For an unextracted lazy symbol (undefined weak), it must have been @@ -3779,9 +3769,9 @@ void VersionTableSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -bool VersionTableSection::isNeeded(Ctx &ctx) const { +bool VersionTableSection::isNeeded() const { return isLive() && - (getPartition().verDef || getPartition().verNeed->isNeeded(ctx)); + (getPartition().verDef || getPartition().verNeed->isNeeded()); } void elf::addVerneed(Symbol *ss) { @@ -3807,7 +3797,7 @@ VersionNeedSection::VersionNeedSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_GNU_verneed, sizeof(uint32_t), ".gnu.version_r") {} -template void VersionNeedSection::finalizeContents(Ctx &) { +template void VersionNeedSection::finalizeContents() { for (SharedFile *f : ctx.sharedFiles) { if (f->vernauxs.empty()) continue; @@ -3840,8 +3830,7 @@ template void VersionNeedSection::finalizeContents(Ctx &) { getParent()->info = verneeds.size(); } -template -void VersionNeedSection::writeTo(Ctx &ctx, uint8_t *buf) { +template void VersionNeedSection::writeTo(uint8_t *buf) { // The Elf_Verneeds need to appear first, followed by the Elf_Vernauxs. 
auto *verneed = reinterpret_cast(buf); auto *vernaux = reinterpret_cast(verneed + verneeds.size()); @@ -3871,12 +3860,12 @@ void VersionNeedSection::writeTo(Ctx &ctx, uint8_t *buf) { verneed[-1].vn_next = 0; } -template size_t VersionNeedSection::getSize(Ctx &ctx) const { +template size_t VersionNeedSection::getSize() const { return verneeds.size() * sizeof(Elf_Verneed) + SharedFile::vernauxNum * sizeof(Elf_Vernaux); } -template bool VersionNeedSection::isNeeded(Ctx &) const { +template bool VersionNeedSection::isNeeded() const { return isLive() && SharedFile::vernauxNum != 0; } @@ -3892,11 +3881,11 @@ MergeTailSection::MergeTailSection(Ctx &ctx, StringRef name, uint32_t type, : MergeSyntheticSection(ctx, name, type, flags, alignment), builder(StringTableBuilder::RAW, llvm::Align(alignment)) {} -size_t MergeTailSection::getSize(Ctx &) const { return builder.getSize(); } +size_t MergeTailSection::getSize() const { return builder.getSize(); } -void MergeTailSection::writeTo(Ctx &, uint8_t *buf) { builder.write(buf); } +void MergeTailSection::writeTo(uint8_t *buf) { builder.write(buf); } -void MergeTailSection::finalizeContents(Ctx &) { +void MergeTailSection::finalizeContents() { // Add all string pieces to the string table builder to create section // contents. for (MergeInputSection *sec : sections) @@ -3916,7 +3905,7 @@ void MergeTailSection::finalizeContents(Ctx &) { sec->pieces[i].outputOff = builder.getOffset(sec->getData(i)); } -void MergeNoTailSection::writeTo(Ctx &ctx, uint8_t *buf) { +void MergeNoTailSection::writeTo(uint8_t *buf) { parallelFor(0, numShards, [&](size_t i) { shards[i].write(buf + shardOffsets[i]); }); } @@ -3929,7 +3918,7 @@ void MergeNoTailSection::writeTo(Ctx &ctx, uint8_t *buf) { // value is different from T's. If that's the case, we can safely put S and // T into different string builders without worrying about merge misses. // We do it in parallel. 
-void MergeNoTailSection::finalizeContents(Ctx &) { +void MergeNoTailSection::finalizeContents() { // Initializes string table builders. for (size_t i = 0; i < numShards; ++i) shards.emplace_back(StringTableBuilder::RAW, llvm::Align(addralign)); @@ -4111,7 +4100,7 @@ static bool isDuplicateArmExidxSec(InputSection *prev, InputSection *cur) { // must be sorted in ascending order of address, Sentinel is set to the // InputSection with the highest address and any InputSections that have // mergeable .ARM.exidx table entries are removed from it. -void ARMExidxSyntheticSection::finalizeContents(Ctx &) { +void ARMExidxSyntheticSection::finalizeContents() { // Ensure that any fixed-point iterations after the first see the original set // of sections. if (!originalExecutableSections.empty()) @@ -4198,7 +4187,7 @@ InputSection *ARMExidxSyntheticSection::getLinkOrderDep() const { // section is to terminate the address range of the previous entry. // 3.) A trailing EXIDX_CANTUNWIND sentinel section is required at the end of // the table to terminate the address range of the final entry. -void ARMExidxSyntheticSection::writeTo(Ctx &ctx, uint8_t *buf) { +void ARMExidxSyntheticSection::writeTo(uint8_t *buf) { // A linker generated CANTUNWIND entry is made up of two words: // 0x0 with R_ARM_PREL31 relocation to target. 
@@ -4235,7 +4224,7 @@ void ARMExidxSyntheticSection::writeTo(Ctx &ctx, uint8_t *buf) { assert(size == offset + 8); } -bool ARMExidxSyntheticSection::isNeeded(Ctx &) const { +bool ARMExidxSyntheticSection::isNeeded() const { return llvm::any_of(exidxSections, [](InputSection *isec) { return isec->isLive(); }); } @@ -4247,7 +4236,7 @@ ThunkSection::ThunkSection(Ctx &ctx, OutputSection *os, uint64_t off) this->outSecOff = off; } -size_t ThunkSection::getSize(Ctx &) const { +size_t ThunkSection::getSize() const { if (roundUpSizeForErrata) return alignTo(size, 4096); return size; @@ -4258,7 +4247,7 @@ void ThunkSection::addThunk(Thunk *t) { t->addSymbols(*this); } -void ThunkSection::writeTo(Ctx &ctx, uint8_t *buf) { +void ThunkSection::writeTo(uint8_t *buf) { for (Thunk *t : thunks) t->writeTo(buf + t->offset); } @@ -4287,7 +4276,7 @@ bool ThunkSection::assignOffsets() { PPC32Got2Section::PPC32Got2Section(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC | SHF_WRITE, SHT_PROGBITS, 4, ".got2") {} -bool PPC32Got2Section::isNeeded(Ctx &) const { +bool PPC32Got2Section::isNeeded() const { // See the comment below. This is not needed if there is no other // InputSection. for (SectionCommand *cmd : getParent()->commands) @@ -4298,7 +4287,7 @@ bool PPC32Got2Section::isNeeded(Ctx &) const { return false; } -void PPC32Got2Section::finalizeContents(Ctx &) { +void PPC32Got2Section::finalizeContents() { // PPC32 may create multiple GOT sections for -fPIC/-fPIE, one per file in // .got2 . This function computes outSecOff of each .got2 to be used in // PPC32PltCallStub::writeTo(). 
The purpose of this empty synthetic section is @@ -4337,11 +4326,11 @@ PPC64LongBranchTargetSection::addEntry(const Symbol *sym, int64_t addend) { return res.first->second; } -size_t PPC64LongBranchTargetSection::getSize(Ctx &ctx) const { +size_t PPC64LongBranchTargetSection::getSize() const { return entries.size() * 8; } -void PPC64LongBranchTargetSection::writeTo(Ctx &ctx, uint8_t *buf) { +void PPC64LongBranchTargetSection::writeTo(uint8_t *buf) { // If linking non-pic we have the final addresses of the targets and they get // written to the table directly. For pic the dynamic linker will allocate // the section and fill it. @@ -4360,7 +4349,7 @@ void PPC64LongBranchTargetSection::writeTo(Ctx &ctx, uint8_t *buf) { } } -bool PPC64LongBranchTargetSection::isNeeded(Ctx &) const { +bool PPC64LongBranchTargetSection::isNeeded() const { // `removeUnusedSyntheticSections()` is called before thunk allocation which // is too early to determine if this section will be empty or not. We need // Finalized to keep the section alive until after thunk creation. Finalized @@ -4434,12 +4423,12 @@ PartitionElfHeaderSection::PartitionElfHeaderSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_LLVM_PART_EHDR, 1, "") {} template -size_t PartitionElfHeaderSection::getSize(Ctx &ctx) const { +size_t PartitionElfHeaderSection::getSize() const { return sizeof(typename ELFT::Ehdr); } template -void PartitionElfHeaderSection::writeTo(Ctx &ctx, uint8_t *buf) { +void PartitionElfHeaderSection::writeTo(uint8_t *buf) { writeEhdr(buf, getPartition()); // Loadable partitions are always ET_DYN. 
@@ -4452,29 +4441,29 @@ PartitionProgramHeadersSection::PartitionProgramHeadersSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_LLVM_PART_PHDR, 1, ".phdrs") {} template -size_t PartitionProgramHeadersSection::getSize(Ctx &ctx) const { +size_t PartitionProgramHeadersSection::getSize() const { return sizeof(typename ELFT::Phdr) * getPartition().phdrs.size(); } template -void PartitionProgramHeadersSection::writeTo(Ctx &ctx, uint8_t *buf) { +void PartitionProgramHeadersSection::writeTo(uint8_t *buf) { writePhdrs(buf, getPartition()); } PartitionIndexSection::PartitionIndexSection(Ctx &ctx) : SyntheticSection(ctx, SHF_ALLOC, SHT_PROGBITS, 4, ".rodata") {} -size_t PartitionIndexSection::getSize(Ctx &ctx) const { +size_t PartitionIndexSection::getSize() const { return 12 * (ctx.partitions.size() - 1); } -void PartitionIndexSection::finalizeContents(Ctx &) { +void PartitionIndexSection::finalizeContents() { for (size_t i = 1; i != ctx.partitions.size(); ++i) ctx.partitions[i].nameStrTab = ctx.mainPart->dynStrTab->addString(ctx.partitions[i].name); } -void PartitionIndexSection::writeTo(Ctx &ctx, uint8_t *buf) { +void PartitionIndexSection::writeTo(uint8_t *buf) { uint64_t va = getVA(); for (size_t i = 1; i != ctx.partitions.size(); ++i) { write32(buf, ctx.mainPart->dynStrTab->getVA() + @@ -4544,7 +4533,7 @@ bool elf::canHaveMemtagGlobals() { } constexpr char kMemtagAndroidNoteName[] = "Android"; -void MemtagAndroidNote::writeTo(Ctx &ctx, uint8_t *buf) { +void MemtagAndroidNote::writeTo(uint8_t *buf) { static_assert( sizeof(kMemtagAndroidNoteName) == 8, "Android 11 & 12 have an ABI that the note name is 8 bytes long. 
Keep it " @@ -4567,13 +4556,13 @@ void MemtagAndroidNote::writeTo(Ctx &ctx, uint8_t *buf) { write32(buf, value); // note value } -size_t MemtagAndroidNote::getSize(Ctx &ctx) const { +size_t MemtagAndroidNote::getSize() const { return sizeof(llvm::ELF::Elf64_Nhdr) + /*namesz=*/alignTo(sizeof(kMemtagAndroidNoteName), 4) + /*descsz=*/sizeof(uint32_t); } -void PackageMetadataNote::writeTo(Ctx &ctx, uint8_t *buf) { +void PackageMetadataNote::writeTo(uint8_t *buf) { write32(buf, 4); write32(buf + 4, ctx.arg.packageMetadata.size() + 1); write32(buf + 8, FDO_PACKAGING_METADATA); @@ -4582,7 +4571,7 @@ void PackageMetadataNote::writeTo(Ctx &ctx, uint8_t *buf) { ctx.arg.packageMetadata.size()); } -size_t PackageMetadataNote::getSize(Ctx &ctx) const { +size_t PackageMetadataNote::getSize() const { return sizeof(llvm::ELF::Elf64_Nhdr) + 4 + alignTo(ctx.arg.packageMetadata.size() + 1, 4); } @@ -4643,19 +4632,19 @@ createMemtagGlobalDescriptors(Ctx &ctx, } bool MemtagGlobalDescriptors::updateAllocSize(Ctx &ctx) { - size_t oldSize = getSize(ctx); + size_t oldSize = getSize(); std::stable_sort(symbols.begin(), symbols.end(), [](const Symbol *s1, const Symbol *s2) { return s1->getVA() < s2->getVA(); }); - return oldSize != getSize(ctx); + return oldSize != getSize(); } -void MemtagGlobalDescriptors::writeTo(Ctx &ctx, uint8_t *buf) { +void MemtagGlobalDescriptors::writeTo(uint8_t *buf) { createMemtagGlobalDescriptors(ctx, symbols, buf); } -size_t MemtagGlobalDescriptors::getSize(Ctx &ctx) const { +size_t MemtagGlobalDescriptors::getSize() const { return createMemtagGlobalDescriptors(ctx, symbols); } @@ -4853,7 +4842,7 @@ template void elf::createSyntheticSections(Ctx &ctx) { ctx.in.partIndex = std::make_unique(ctx); addOptionalRegular("__part_index_begin", ctx.in.partIndex.get(), 0); addOptionalRegular("__part_index_end", ctx.in.partIndex.get(), - ctx.in.partIndex->getSize(ctx)); + ctx.in.partIndex->getSize()); add(*ctx.in.partIndex); } diff --git a/lld/ELF/SyntheticSections.h 
b/lld/ELF/SyntheticSections.h index 283b2953449e59..421ef760ef4a09 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -51,10 +51,10 @@ struct CieRecord { class EhFrameSection final : public SyntheticSection { public: EhFrameSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - void finalizeContents(Ctx &) override; - bool isNeeded(Ctx &) const override { return !sections.empty(); } - size_t getSize(Ctx &ctx) const override { return size; } + void writeTo(uint8_t *buf) override; + void finalizeContents() override; + bool isNeeded() const override { return !sections.empty(); } + size_t getSize() const override { return size; } static bool classof(const SectionBase *d) { return SyntheticSection::classof(d) && d->name == ".eh_frame"; @@ -105,10 +105,10 @@ class EhFrameSection final : public SyntheticSection { class GotSection final : public SyntheticSection { public: GotSection(Ctx &); - size_t getSize(Ctx &ctx) const override { return size; } - void finalizeContents(Ctx &) override; - bool isNeeded(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override { return size; } + void finalizeContents() override; + bool isNeeded() const override; + void writeTo(uint8_t *buf) override; void addConstant(const Relocation &r); void addEntry(const Symbol &sym); @@ -139,15 +139,15 @@ class GnuStackSection : public SyntheticSection { GnuStackSection(Ctx &ctx) : SyntheticSection(ctx, 0, llvm::ELF::SHT_PROGBITS, 1, ".note.GNU-stack") {} - void writeTo(Ctx &, uint8_t *buf) override {} - size_t getSize(Ctx &ctx) const override { return 0; } + void writeTo(uint8_t *buf) override {} + size_t getSize() const override { return 0; } }; class GnuPropertySection final : public SyntheticSection { public: GnuPropertySection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; }; // .note.gnu.build-id 
section. @@ -158,8 +158,8 @@ class BuildIdSection : public SyntheticSection { public: const size_t hashSize; BuildIdSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return headerSize + hashSize; } + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return headerSize + hashSize; } void writeBuildId(llvm::ArrayRef buf); private: @@ -173,9 +173,9 @@ class BuildIdSection : public SyntheticSection { class BssSection final : public SyntheticSection { public: BssSection(Ctx &, StringRef name, uint64_t size, uint32_t addralign); - void writeTo(Ctx &, uint8_t *) override {} - bool isNeeded(Ctx &) const override { return size != 0; } - size_t getSize(Ctx &ctx) const override { return size; } + void writeTo(uint8_t *) override {} + bool isNeeded() const override { return size != 0; } + size_t getSize() const override { return size; } static bool classof(const SectionBase *s) { return s->bss; } uint64_t size; @@ -184,11 +184,11 @@ class BssSection final : public SyntheticSection { class MipsGotSection final : public SyntheticSection { public: MipsGotSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return size; } + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return size; } bool updateAllocSize(Ctx &) override; - void finalizeContents(Ctx &) override; - bool isNeeded(Ctx &) const override; + void finalizeContents() override; + bool isNeeded() const override; // Join separate GOTs built for each input file to generate // primary and optional multiple secondary GOTs. 
@@ -362,9 +362,9 @@ class GotPltSection final : public SyntheticSection { public: GotPltSection(Ctx &); void addEntry(Symbol &sym); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; - bool isNeeded(Ctx &) const override; + size_t getSize() const override; + void writeTo(uint8_t *buf) override; + bool isNeeded() const override; // Flag to force GotPlt to be in output if we have relocations // that relies on its address. @@ -382,9 +382,9 @@ class IgotPltSection final : public SyntheticSection { public: IgotPltSection(Ctx &); void addEntry(Symbol &sym); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; - bool isNeeded(Ctx &) const override { return !entries.empty(); } + size_t getSize() const override; + void writeTo(uint8_t *buf) override; + bool isNeeded() const override { return !entries.empty(); } private: SmallVector entries; @@ -394,8 +394,8 @@ class StringTableSection final : public SyntheticSection { public: StringTableSection(Ctx &, StringRef name, bool dynamic); unsigned addString(StringRef s, bool hashIt = true); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return size; } + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return size; } bool isDynamic() const { return dynamic; } private: @@ -485,9 +485,9 @@ template class DynamicSection final : public SyntheticSection { public: DynamicSection(Ctx &); - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return size; } + void finalizeContents() override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return size; } private: std::vector> computeContents(); @@ -538,17 +538,15 @@ class RelocationBaseSection : public SyntheticSection { sec.addReloc({expr, addendRelType, offsetInSec, addend, &sym}); addReloc({dynType, &sec, offsetInSec, kind, sym, addend, expr}); } - bool 
isNeeded(Ctx &) const override { + bool isNeeded() const override { return !relocs.empty() || llvm::any_of(relocsVec, [](auto &v) { return !v.empty(); }); } - size_t getSize(Ctx &ctx) const override { - return relocs.size() * this->entsize; - } + size_t getSize() const override { return relocs.size() * this->entsize; } size_t getRelativeRelocCount() const { return numRelativeRelocs; } void mergeRels(); void partitionRels(); - void finalizeContents(Ctx &) override; + void finalizeContents() override; static bool classof(const SectionBase *d) { return SyntheticSection::classof(d) && (d->type == llvm::ELF::SHT_RELA || d->type == llvm::ELF::SHT_REL || @@ -581,7 +579,7 @@ class RelocationSection final : public RelocationBaseSection { public: RelocationSection(Ctx &, StringRef name, bool combreloc, unsigned concurrency); - void writeTo(Ctx &, uint8_t *buf) override; + void writeTo(uint8_t *buf) override; }; template @@ -593,8 +591,8 @@ class AndroidPackedRelocationSection final : public RelocationBaseSection { AndroidPackedRelocationSection(Ctx &, StringRef name, unsigned concurrency); bool updateAllocSize(Ctx &) override; - size_t getSize(Ctx &ctx) const override { return relocData.size(); } - void writeTo(Ctx &, uint8_t *buf) override { + size_t getSize() const override { return relocData.size(); } + void writeTo(uint8_t *buf) override { memcpy(buf, relocData.data(), relocData.size()); } @@ -615,7 +613,7 @@ class RelrBaseSection : public SyntheticSection { public: RelrBaseSection(Ctx &, unsigned concurrency, bool isAArch64Auth = false); void mergeRels(); - bool isNeeded(Ctx &) const override { + bool isNeeded() const override { return !relocs.empty() || llvm::any_of(relocsVec, [](auto &v) { return !v.empty(); }); } @@ -634,11 +632,9 @@ template class RelrSection final : public RelrBaseSection { RelrSection(Ctx &, unsigned concurrency, bool isAArch64Auth = false); bool updateAllocSize(Ctx &) override; - size_t getSize(Ctx &ctx) const override { - return 
relrRelocs.size() * this->entsize; - } - void writeTo(Ctx &ctx, uint8_t *buf) override { - memcpy(buf, relrRelocs.data(), getSize(ctx)); + size_t getSize() const override { return relrRelocs.size() * this->entsize; } + void writeTo(uint8_t *buf) override { + memcpy(buf, relrRelocs.data(), getSize()); } private: @@ -653,8 +649,8 @@ struct SymbolTableEntry { class SymbolTableBaseSection : public SyntheticSection { public: SymbolTableBaseSection(Ctx &ctx, StringTableSection &strTabSec); - void finalizeContents(Ctx &) override; - size_t getSize(Ctx &ctx) const override { return getNumSymbols() * entsize; } + void finalizeContents() override; + size_t getSize() const override { return getNumSymbols() * entsize; } void addSymbol(Symbol *sym); unsigned getNumSymbols() const { return symbols.size() + 1; } size_t getSymbolIndex(const Symbol &sym); @@ -679,17 +675,17 @@ class SymbolTableSection final : public SymbolTableBaseSection { public: SymbolTableSection(Ctx &, StringTableSection &strTabSec); - void writeTo(Ctx &, uint8_t *buf) override; + void writeTo(uint8_t *buf) override; }; class SymtabShndxSection final : public SyntheticSection { public: SymtabShndxSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; - bool isNeeded(Ctx &) const override; - void finalizeContents(Ctx &) override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; + bool isNeeded() const override; + void finalizeContents() override; }; // Outputs GNU Hash section. 
For detailed explanation see: @@ -697,9 +693,9 @@ class SymtabShndxSection final : public SyntheticSection { class GnuHashTableSection final : public SyntheticSection { public: GnuHashTableSection(Ctx &); - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return size; } + void finalizeContents() override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return size; } // Adds symbols to the hash table. // Sorts the input to satisfy GNU hash section requirements. @@ -725,9 +721,9 @@ class GnuHashTableSection final : public SyntheticSection { class HashTableSection final : public SyntheticSection { public: HashTableSection(Ctx &); - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return size; } + void finalizeContents() override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return size; } private: size_t size = 0; @@ -747,9 +743,9 @@ class HashTableSection final : public SyntheticSection { class PltSection : public SyntheticSection { public: PltSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; - bool isNeeded(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; + bool isNeeded() const override; void addSymbols(); void addEntry(Symbol &sym); size_t getNumEntries() const { return entries.size(); } @@ -768,9 +764,9 @@ class IpltSection final : public SyntheticSection { public: IpltSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; - bool isNeeded(Ctx &) const override { return !entries.empty(); } + void writeTo(uint8_t *buf) override; + size_t getSize() const override; + bool isNeeded() const override { return !entries.empty(); } void addSymbols(); void addEntry(Symbol &sym); }; @@ -778,8 +774,8 @@ class 
IpltSection final : public SyntheticSection { class PPC32GlinkSection : public PltSection { public: PPC32GlinkSection(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; SmallVector canonical_plts; static constexpr size_t footerSize = 64; @@ -789,9 +785,9 @@ class PPC32GlinkSection : public PltSection { class IBTPltSection : public SyntheticSection { public: IBTPltSection(Ctx &); - void writeTo(Ctx &, uint8_t *Buf) override; - bool isNeeded(Ctx &) const override; - size_t getSize(Ctx &) const override; + void writeTo(uint8_t *Buf) override; + bool isNeeded() const override; + size_t getSize() const override; }; // Used to align the end of the PT_GNU_RELRO segment and the associated PT_LOAD @@ -800,8 +796,8 @@ class IBTPltSection : public SyntheticSection { class RelroPaddingSection final : public SyntheticSection { public: RelroPaddingSection(Ctx &); - size_t getSize(Ctx &ctx) const override { return 0; } - void writeTo(Ctx &, uint8_t *buf) override {} + size_t getSize() const override { return 0; } + void writeTo(uint8_t *buf) override {} }; // Used by the merged DWARF32 .debug_names (a per-module index). 
If we @@ -875,8 +871,8 @@ class DebugNamesBaseSection : public SyntheticSection { }; DebugNamesBaseSection(Ctx &); - size_t getSize(Ctx &ctx) const override { return size; } - bool isNeeded(Ctx &) const override { return numChunks > 0; } + size_t getSize() const override { return size; } + bool isNeeded() const override { return numChunks > 0; } protected: void init(llvm::function_ref); @@ -919,8 +915,8 @@ template class DebugNamesSection final : public DebugNamesBaseSection { public: DebugNamesSection(Ctx &); - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override; + void finalizeContents() override; + void writeTo(uint8_t *buf) override; template void getNameRelocs(const InputFile &file, @@ -968,9 +964,9 @@ class GdbIndexSection final : public SyntheticSection { GdbIndexSection(Ctx &); template static std::unique_ptr create(Ctx &); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { return size; } - bool isNeeded(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override { return size; } + bool isNeeded() const override; private: struct GdbIndexHeader { @@ -1007,9 +1003,9 @@ class EhFrameHeader final : public SyntheticSection { public: EhFrameHeader(Ctx &); void write(); - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; - bool isNeeded(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; + bool isNeeded() const override; }; // For more information about .gnu.version and .gnu.version_r see: @@ -1023,9 +1019,9 @@ class EhFrameHeader final : public SyntheticSection { class VersionDefinitionSection final : public SyntheticSection { public: VersionDefinitionSection(Ctx &); - void finalizeContents(Ctx &) override; - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; + void finalizeContents() override; + size_t getSize() const override; + void 
writeTo(uint8_t *buf) override; private: enum { EntrySize = 28 }; @@ -1045,10 +1041,10 @@ class VersionDefinitionSection final : public SyntheticSection { class VersionTableSection final : public SyntheticSection { public: VersionTableSection(Ctx &); - void finalizeContents(Ctx &) override; - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; - bool isNeeded(Ctx &) const override; + void finalizeContents() override; + size_t getSize() const override; + void writeTo(uint8_t *buf) override; + bool isNeeded() const override; }; // The .gnu.version_r section defines the version identifiers used by @@ -1076,10 +1072,10 @@ class VersionNeedSection final : public SyntheticSection { public: VersionNeedSection(Ctx &); - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; - bool isNeeded(Ctx &) const override; + void finalizeContents() override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; + bool isNeeded() const override; }; // MergeSyntheticSection is a class that allows us to put mergeable sections @@ -1102,9 +1098,9 @@ class MergeTailSection final : public MergeSyntheticSection { MergeTailSection(Ctx &ctx, StringRef name, uint32_t type, uint64_t flags, uint32_t addralign); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; - void finalizeContents(Ctx &) override; + size_t getSize() const override; + void writeTo(uint8_t *buf) override; + void finalizeContents() override; private: llvm::StringTableBuilder builder; @@ -1116,9 +1112,9 @@ class MergeNoTailSection final : public MergeSyntheticSection { uint32_t addralign) : MergeSyntheticSection(ctx, name, type, flags, addralign) {} - size_t getSize(Ctx &ctx) const override { return size; } - void writeTo(Ctx &, uint8_t *buf) override; - void finalizeContents(Ctx &) override; + size_t getSize() const override { return size; } + void writeTo(uint8_t *buf) 
override; + void finalizeContents() override; private: // We use the most significant bits of a hash as a shard ID. @@ -1149,8 +1145,8 @@ class MipsAbiFlagsSection final : public SyntheticSection { static std::unique_ptr create(Ctx &); MipsAbiFlagsSection(Ctx &, Elf_Mips_ABIFlags flags); - size_t getSize(Ctx &ctx) const override { return sizeof(Elf_Mips_ABIFlags); } - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override { return sizeof(Elf_Mips_ABIFlags); } + void writeTo(uint8_t *buf) override; private: Elf_Mips_ABIFlags flags; @@ -1165,9 +1161,9 @@ template class MipsOptionsSection final : public SyntheticSection { static std::unique_ptr> create(Ctx &); MipsOptionsSection(Ctx &, Elf_Mips_RegInfo reginfo); - void writeTo(Ctx &, uint8_t *buf) override; + void writeTo(uint8_t *buf) override; - size_t getSize(Ctx &ctx) const override { + size_t getSize() const override { return sizeof(Elf_Mips_Options) + sizeof(Elf_Mips_RegInfo); } @@ -1183,8 +1179,8 @@ template class MipsReginfoSection final : public SyntheticSection { static std::unique_ptr create(Ctx &); MipsReginfoSection(Ctx &, Elf_Mips_RegInfo reginfo); - size_t getSize(Ctx &ctx) const override { return sizeof(Elf_Mips_RegInfo); } - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override { return sizeof(Elf_Mips_RegInfo); } + void writeTo(uint8_t *buf) override; private: Elf_Mips_RegInfo reginfo; @@ -1197,8 +1193,8 @@ template class MipsReginfoSection final : public SyntheticSection { class MipsRldMapSection final : public SyntheticSection { public: MipsRldMapSection(Ctx &); - size_t getSize(Ctx &ctx) const override { return ctx.arg.wordsize; } - void writeTo(Ctx &, uint8_t *buf) override {} + size_t getSize() const override { return ctx.arg.wordsize; } + void writeTo(uint8_t *buf) override {} }; // Representation of the combined .ARM.Exidx input sections. 
We process these @@ -1243,11 +1239,11 @@ class ARMExidxSyntheticSection : public SyntheticSection { // section needs to be removed from the main input section list. bool addSection(InputSection *isec); - size_t getSize(Ctx &ctx) const override { return size; } - void writeTo(Ctx &, uint8_t *buf) override; - bool isNeeded(Ctx &) const override; + size_t getSize() const override { return size; } + void writeTo(uint8_t *buf) override; + bool isNeeded() const override; // Sort and remove duplicate entries. - void finalizeContents(Ctx &) override; + void finalizeContents() override; InputSection *getLinkOrderDep() const; static bool classof(const SectionBase *sec) { @@ -1291,8 +1287,8 @@ class ThunkSection final : public SyntheticSection { // Thunk defines a symbol in this InputSection that can be used as target // of a relocation void addThunk(Thunk *t); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override; + void writeTo(uint8_t *buf) override; InputSection *getTargetInputSection() const; bool assignOffsets(); @@ -1315,17 +1311,16 @@ class ArmCmseSGVeneer; class ArmCmseSGSection final : public SyntheticSection { public: ArmCmseSGSection(Ctx &ctx); - bool isNeeded(Ctx &) const override { return !entries.empty(); } - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; + bool isNeeded() const override { return !entries.empty(); } + size_t getSize() const override; + void writeTo(uint8_t *buf) override; void addSGVeneer(Symbol *sym, Symbol *ext_sym); void addMappingSymbol(); - void finalizeContents(Ctx &) override; + void finalizeContents() override; void exportEntries(SymbolTableBaseSection *symTab); uint64_t impLibMaxAddr = 0; private: - Ctx &ctx; SmallVector, 0> entries; SmallVector sgVeneers; uint64_t newEntries = 0; @@ -1336,10 +1331,10 @@ class ArmCmseSGSection final : public SyntheticSection { class PPC32Got2Section final : public SyntheticSection { public: 
PPC32Got2Section(Ctx &); - size_t getSize(Ctx &ctx) const override { return 0; } - bool isNeeded(Ctx &) const override; - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override {} + size_t getSize() const override { return 0; } + bool isNeeded() const override; + void finalizeContents() override; + void writeTo(uint8_t *buf) override {} }; // This section is used to store the addresses of functions that are called @@ -1352,10 +1347,10 @@ class PPC64LongBranchTargetSection final : public SyntheticSection { PPC64LongBranchTargetSection(Ctx &); uint64_t getEntryVA(const Symbol *sym, int64_t addend); std::optional addEntry(const Symbol *sym, int64_t addend); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; - bool isNeeded(Ctx &) const override; - void finalizeContents(Ctx &) override { finalized = true; } + size_t getSize() const override; + void writeTo(uint8_t *buf) override; + bool isNeeded() const override; + void finalizeContents() override { finalized = true; } private: SmallVector, 0> entries; @@ -1367,24 +1362,24 @@ template class PartitionElfHeaderSection final : public SyntheticSection { public: PartitionElfHeaderSection(Ctx &); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override; + void writeTo(uint8_t *buf) override; }; template class PartitionProgramHeadersSection final : public SyntheticSection { public: PartitionProgramHeadersSection(Ctx &); - size_t getSize(Ctx &) const override; - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override; + void writeTo(uint8_t *buf) override; }; class PartitionIndexSection final : public SyntheticSection { public: PartitionIndexSection(Ctx &); - size_t getSize(Ctx &) const override; - void finalizeContents(Ctx &) override; - void writeTo(Ctx &, uint8_t *buf) override; + size_t getSize() const override; + void finalizeContents() override; + void writeTo(uint8_t *buf) 
override; }; // See the following link for the Android-specific loader code that operates on @@ -1395,8 +1390,8 @@ class MemtagAndroidNote final : public SyntheticSection { MemtagAndroidNote(Ctx &ctx) : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, /*alignment=*/4, ".note.android.memtag") {} - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; }; class PackageMetadataNote final : public SyntheticSection { @@ -1404,8 +1399,8 @@ class PackageMetadataNote final : public SyntheticSection { PackageMetadataNote(Ctx &ctx) : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, /*alignment=*/4, ".note.package") {} - void writeTo(Ctx &, uint8_t *buf) override; - size_t getSize(Ctx &) const override; + void writeTo(uint8_t *buf) override; + size_t getSize() const override; }; class MemtagGlobalDescriptors final : public SyntheticSection { @@ -1414,19 +1409,19 @@ class MemtagGlobalDescriptors final : public SyntheticSection { : SyntheticSection(ctx, llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_AARCH64_MEMTAG_GLOBALS_DYNAMIC, /*alignment=*/4, ".memtag.globals.dynamic") {} - void writeTo(Ctx &, uint8_t *buf) override; + void writeTo(uint8_t *buf) override; // The size of the section is non-computable until all addresses are // synthetized, because the section's contents contain a sorted // varint-compressed list of pointers to global variables. We only know the // final size after `finalizeAddressDependentContent()`. 
- size_t getSize(Ctx &) const override; + size_t getSize() const override; bool updateAllocSize(Ctx &) override; void addSymbol(const Symbol &sym) { symbols.push_back(&sym); } - bool isNeeded(Ctx &) const override { return !symbols.empty(); } + bool isNeeded() const override { return !symbols.empty(); } private: SmallVector symbols; diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index f4a22ea953ec49..f9a21b6745fdd1 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -837,10 +837,10 @@ template void Writer::setReservedSymbolSections() { } // .rela_iplt_{start,end} mark the start and the end of .rel[a].dyn. - if (ctx.sym.relaIpltStart && ctx.mainPart->relaDyn->isNeeded(ctx)) { + if (ctx.sym.relaIpltStart && ctx.mainPart->relaDyn->isNeeded()) { ctx.sym.relaIpltStart->section = ctx.mainPart->relaDyn.get(); ctx.sym.relaIpltEnd->section = ctx.mainPart->relaDyn.get(); - ctx.sym.relaIpltEnd->value = ctx.mainPart->relaDyn->getSize(ctx); + ctx.sym.relaIpltEnd->value = ctx.mainPart->relaDyn->getSize(); } PhdrEntry *last = nullptr; @@ -1425,9 +1425,9 @@ template void Writer::resolveShfLinkOrder() { } static void finalizeSynthetic(Ctx &ctx, SyntheticSection *sec) { - if (sec && sec->isNeeded(ctx) && sec->getParent()) { + if (sec && sec->isNeeded() && sec->getParent()) { llvm::TimeTraceScope timeScope("Finalize synthetic sections", sec->name); - sec->finalizeContents(ctx); + sec->finalizeContents(); } } @@ -1679,7 +1679,7 @@ static void removeUnusedSyntheticSections(Ctx &ctx) { auto end = std::remove_if(start, ctx.inputSections.end(), [&](InputSectionBase *s) { auto *sec = cast(s); - if (sec->getParent() && sec->isNeeded(ctx)) + if (sec->getParent() && sec->isNeeded()) return false; // .relr.auth.dyn relocations may be moved to .rela.dyn in // finalizeAddressDependentContent, making .rela.dyn no longer empty. 
@@ -1810,9 +1810,9 @@ template void Writer::finalizeSections() { reportUndefinedSymbols(ctx); postScanRelocations(ctx); - if (ctx.in.plt && ctx.in.plt->isNeeded(ctx)) + if (ctx.in.plt && ctx.in.plt->isNeeded()) ctx.in.plt->addSymbols(); - if (ctx.in.iplt && ctx.in.iplt->isNeeded(ctx)) + if (ctx.in.iplt && ctx.in.iplt->isNeeded()) ctx.in.iplt->addSymbols(); if (ctx.arg.unresolvedSymbolsInShlib != UnresolvedPolicy::Ignore) { @@ -2312,7 +2312,7 @@ SmallVector Writer::createPhdrs(Partition &part) { ret.push_back(relRo); // PT_GNU_EH_FRAME is a special section pointing on .eh_frame_hdr. - if (part.ehFrame->isNeeded(ctx) && part.ehFrameHdr && + if (part.ehFrame->isNeeded() && part.ehFrameHdr && part.ehFrame->getParent() && part.ehFrameHdr->getParent()) addHdr(PT_GNU_EH_FRAME, part.ehFrameHdr->getParent()->getPhdrFlags()) ->add(part.ehFrameHdr->getParent()); @@ -2574,7 +2574,7 @@ template void Writer::setPhdrs(Partition &part) { // output section. We always want to describe just the // SyntheticSection. if (part.armExidx && p->p_type == PT_ARM_EXIDX) { - p->p_filesz = part.armExidx->getSize(ctx); + p->p_filesz = part.armExidx->getSize(); p->p_memsz = p->p_filesz; p->p_offset = first->offset + part.armExidx->outSecOff; p->p_vaddr = first->addr + part.armExidx->outSecOff; From 173c68239d1d11f4e36c8af07a28310da67568a7 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Fri, 11 Oct 2024 08:50:49 +0200 Subject: [PATCH 133/177] [AMDGPU] Enable unaligned scratch accesses (#110219) This allows us to emit wide generic and scratch memory accesses when we do not have alignment information. In cases where accesses happen to be properly aligned or where generic accesses do not go to scratch memory, this improves performance of the generated code by a factor of up to 16x and reduces code size, especially when lowering memcpy and memmove intrinsics. 
Also: Make the use of the FeatureUnalignedScratchAccess feature more consistent: FeatureUnalignedScratchAccess and EnableFlatScratch are now orthogonal, whereas, before, code assumed that the latter implies the former at some places. Part of SWDEV-455845. --- llvm/lib/Target/AMDGPU/AMDGPU.td | 24 +- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 4 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 16 +- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 1037 ++- .../AMDGPU/GlobalISel/legalize-load-flat.mir | 3222 +------- .../GlobalISel/legalize-load-private.mir | 5246 +++++++------ llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 28 +- .../test/CodeGen/AMDGPU/flat-address-space.ll | 12 +- .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 98 +- llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 2438 +----- .../AMDGPU/memcpy-param-combinations.ll | 6516 +++-------------- .../AMDGPU/memmove-param-combinations.ll | 5196 ++----------- llvm/test/CodeGen/AMDGPU/sdwa-commute.ll | 4 +- .../CodeGen/AMDGPU/unaligned-load-store.ll | 28 +- 15 files changed, 6082 insertions(+), 17791 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 25117544d6a849..62fac085897ab6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1178,9 +1178,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero, - FeatureVmemWriteVgprInOrder + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, + FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder ] >; @@ -1199,9 
+1199,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts, - FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength63, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, + FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, FeatureVmemWriteVgprInOrder @@ -1223,9 +1223,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, - FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength32, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, + FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureVmemWriteVgprInOrder ] @@ -1246,9 +1246,9 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast, - FeatureMaxHardClauseLength32, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureTrue16BitInsts, + FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32, 
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAgentScopeFineGrainedRemoteMemoryAtomics ] diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 3f4f42377d56ee..d701bf037fdfa6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -387,8 +387,8 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, // them later if they may access private memory. We don't have enough context // here, and legalization can handle it. if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { - return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && - ChainSizeInBytes <= ST->getMaxPrivateElementSize(); + return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) && + ChainSizeInBytes <= ST->getMaxPrivateElementSize(); } return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 19458126093167..1ea3beb2855d69 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -591,6 +591,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return UnalignedScratchAccess; } + bool hasUnalignedScratchAccessEnabled() const { + return UnalignedScratchAccess && UnalignedAccessMode; + } + bool hasUnalignedAccessMode() const { return UnalignedAccessMode; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3d8e03521e2b90..8c197f23149612 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1824,26 +1824,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( Subtarget->hasUnalignedDSAccessEnabled(); } - if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { - bool AlignedBy4 = Alignment >= Align(4); - if (IsFast) - *IsFast = AlignedBy4; - - return AlignedBy4 || - Subtarget->enableFlatScratch() || - 
Subtarget->hasUnalignedScratchAccess(); - } - // FIXME: We have to be conservative here and assume that flat operations // will access scratch. If we had access to the IR function, then we // could determine if any private memory was used in the function. - if (AddrSpace == AMDGPUAS::FLAT_ADDRESS && - !Subtarget->hasUnalignedScratchAccess()) { + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || + AddrSpace == AMDGPUAS::FLAT_ADDRESS) { bool AlignedBy4 = Alignment >= Align(4); if (IsFast) *IsFast = AlignedBy4; - return AlignedBy4; + return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled(); } // So long as they are correct, wide global memory operations perform better diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index ce528467cd35b4..6e2e88f22600a8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -2428,11 +2428,54 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, 15 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v3, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v4, off ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, 0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 6, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v3, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v4, off +; 
UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr3 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v3, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v6, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v8, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2441,30 +2484,143 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v3, 4, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 5, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 6, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v8, 7, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v3, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v8, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v3, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt 
vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v8, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[2:3], 15 -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, 15 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v3, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v4, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, 0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 6, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v4, off sc0 
sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr3 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v8, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 
-; UNALIGNED_GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v6, 5, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 6, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v8, 7, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v3, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v8, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v3, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v8, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2475,12 +2631,39 @@ define void @store_load_i64_unaligned(ptr 
addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: 
scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:7 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -2572,59 +2755,293 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v3, off ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 6, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v9, 8, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v10, 10, v0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v12, 3 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v3, 
off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v1, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v11, 9, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v9, v12, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v12, 11, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v10, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v4, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v6, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v8, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v9, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v11, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v10, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v12, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX10-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX10: ; %bb.0: ; %bb ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX10-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 1 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 2 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v8, 6, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v9, 8, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v10, 3 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v11, 9, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v12, 10, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v13, 11, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v9, v10, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt 
vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v8, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v2, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v9, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v11, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v12, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v13, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, s2 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, s1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v2, s0 -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v3, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 6, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v9, 8, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v10, 10, v0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v12, 3 +; UNALIGNED_GFX940-NEXT: 
scratch_store_byte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v11, 9, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v9, v12, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v11, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v12, 11, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v10, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v12, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; 
UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v8, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v4, 
2, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v10, 3 :: v_dual_add_nc_u32 v5, 1, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v8, 6, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v9, 8, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v11, 9, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v12, 10, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v13, 11, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v8, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v2, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v9, v10, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v11, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v12, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v13, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; 
UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v8, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v2, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v9, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v11, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v12, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v13, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2635,16 +3052,57 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX12-NEXT: s_wait_alu 0xfffe -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, s0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 2 +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS +; 
UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 3 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 
v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:11 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -2742,64 +3200,382 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, s3 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off -; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, 
v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v6, 4 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 6, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v10, 8, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v11, 10, v0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v13, 3 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v14, 12, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v15, 14, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v9, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v1, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v9, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v12, 9, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v10, v13, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v13, 11, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v16, 13, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte 
v14, v6, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v16, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 15, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v15, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v4, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v9, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v8, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v10, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v12, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v11, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v13, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v14, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v16, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v15, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr11 +; 
UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr15 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr13 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr14 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr16 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v0, v6, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX10-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX10: ; %bb.0: ; %bb ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX10-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v4, s3 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 1 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 2 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: 
scratch_store_byte v5, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v9, 6, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v10, 8, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v11, 3 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v12, 9, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v9, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v13, 10, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v10, v11, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v11, 11, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v8, 4 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v14, 12, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v15, 13, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v16, 14, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v17, 15, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v14, v8, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: scratch_store_byte v15, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: 
scratch_store_byte v16, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v17, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v9, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v2, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v10, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v12, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v13, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v11, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v14, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v15, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v16, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v17, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: 
store_load_v4i32_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[4:5], s[2:3] -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v6, 4 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 6, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v10, 8, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v11, 10, v0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v13, 3 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v14, 12, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v15, 14, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v3, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v9, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v9, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 7, v0 +; 
UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v12, 9, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v10, v13, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v12, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v13, 11, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v11, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v13, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v16, 13, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v14, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v16, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 15, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v15, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: 
scratch_load_ubyte v3, v8, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v13, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v14, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v16, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v15, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr15 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr13 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr14 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr16 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v0, v6, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; UNALIGNED_GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v4, 1, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v11, 3 :: v_dual_add_nc_u32 v6, 4, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v8, 4 :: v_dual_add_nc_u32 v5, 2, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v9, 6, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v10, 8, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v12, 9, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v9, v3, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v13, 10, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v2, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; 
UNALIGNED_GFX11-NEXT: scratch_store_b8 v10, v11, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v12, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v11, 11, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v14, 12, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v15, 13, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v16, 14, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v17, 15, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v13, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v11, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v14, v8, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v15, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v16, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v17, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v9, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v2, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v10, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v12, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v13, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v11, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v14, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v15, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v16, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v17, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2810,17 +3586,74 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX12-NEXT: s_wait_alu 0xfffe -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 2 +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; 
UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 3 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 4 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:12 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:13 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:14 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:15 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: 
scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:12 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:13 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:14 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:15 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir index 
b1d7d36f9912e7..032ca7c0d4fee9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ -483,40 +483,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s16_align1 ; GFX12: 
liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -664,40 +646,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s32_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s32_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -798,70 +762,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; 
GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s32_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) 
= G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s32_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -1247,76 +1163,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) 
= G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX11PLUS-LABEL: name: test_load_flat_s64_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: 
[[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX12-LABEL: name: test_load_flat_s64_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: 
[[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -1485,130 +1347,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: 
[[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX11PLUS-LABEL: name: test_load_flat_s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT 
i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; 
GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX12-LABEL: name: test_load_flat_s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from 
unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: 
[[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -2075,87 +1829,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX9PLUS-NEXT: 
[[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX11PLUS-LABEL: name: test_load_flat_s96_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX12-LABEL: name: test_load_flat_s96_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align2 @@ -2369,165 +2060,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; 
GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 
7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9PLUS-NEXT: 
$vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX11PLUS-LABEL: name: test_load_flat_s96_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; 
GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX12-LABEL: name: test_load_flat_s96_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: 
[[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align1 @@ -3334,210 +2884,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: 
[[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; GFX11PLUS-LABEL: name: test_load_flat_s128_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from 
unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) 
from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; GFX12-LABEL: name: test_load_flat_s128_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], 
[[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], 
[[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s128_align1 @@ -4132,133 +3496,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) 
= G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; 
GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; GFX11PLUS-LABEL: name: test_load_flat_p1_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - 
; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; GFX12-LABEL: name: test_load_flat_p1_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; 
GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p1_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -4662,79 +3915,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from 
unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX11PLUS-LABEL: name: test_load_flat_p4_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX12-LABEL: name: test_load_flat_p4_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from 
unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -4906,133 +4102,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], 
[[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; 
GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX11PLUS-LABEL: name: test_load_flat_p4_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX12-LABEL: name: test_load_flat_p4_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5274,43 +4359,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; 
GFX11PLUS-LABEL: name: test_load_flat_p5_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX12-LABEL: name: test_load_flat_p5_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ 
-5416,73 +4480,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11PLUS-LABEL: name: test_load_flat_p5_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: 
(load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX12-LABEL: name: test_load_flat_p5_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load 
(s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5732,40 +4745,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) 
= G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v2s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: 
test_load_flat_v2s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6158,121 +5153,106 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX9PLUS-NEXT: 
[[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; 
GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v3s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX11PLUS-NEXT: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX11PLUS-NEXT: 
[[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX12-LABEL: name: test_load_flat_v3s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = 
G_SHL [[AND3]], [[C5]](s16) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6503,40 +5483,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; 
GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v4s8_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6638,70 +5600,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; 
GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v4s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -7185,40 +6099,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY 
$vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v2s16_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -7327,70 +6223,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; 
GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; 
GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v2s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -8291,36 +7139,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], 
[[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) 
; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8334,36 +7168,22 @@ body: | ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; 
GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX11PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX11PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX11PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8377,36 +7197,22 @@ body: | ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; 
GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8765,70 
+7571,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY 
$vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v4s16_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) 
= G_TRUNC [[LOAD]](s32) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -9005,124 +7763,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; 
GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: 
[[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from 
unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - 
; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v4s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; 
GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -10686,133 +9342,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 
2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; 
GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: 
[[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v2s64_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; 
GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD 
[[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -11100,235 +9645,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) 
:: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: 
[[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - 
; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: 
[[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; 
GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v2s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; 
GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: 
[[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -12078,342 +10410,42 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; 
GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; 
GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: 
[[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[PTR_ADD15]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX9PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX9PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX9PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v3s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load 
(<2 x s64>), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: 
[[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: 
[[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX11PLUS-NEXT: 
[[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; 
GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX11PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v3s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: 
[[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; 
GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: 
(load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], 
[[C7]](s64) - ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX12-NEXT: 
[[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX12-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s64_align1 @@ -13306,441 +11338,33 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: 
[[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) 
- ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: 
(load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = 
G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], 
[[ZEXTLOAD15]] - ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX9PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX9PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX9PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX9PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX9PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX9PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX9PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX9PLUS-NEXT: 
[[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX9PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX9PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX9PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX9PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX9PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX9PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX9PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX9PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX9PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX9PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX9PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) 
- ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; 
GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; 
GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from 
unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = 
G_SHL [[LOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX11PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX11PLUS-NEXT: 
[[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX11PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX11PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX11PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX11PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX11PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX11PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX11PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX11PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX11PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX11PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) 
from unknown-address + 30) - ; GFX11PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX11PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX11PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX11PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX11PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX11PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v4s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = 
G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; 
GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX12-NEXT: 
[[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX12-NEXT: 
[[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX12-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX12-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX12-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX12-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX12-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX12-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX12-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX12-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX12-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX12-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX12-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX12-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX12-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX12-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX12-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX12-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX12-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX12-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX12-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX12-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX12-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX12-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX12-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX12-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s64_align1 @@ -14762,210 +12386,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - 
; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; 
GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p1_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; 
GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: 
[[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX11PLUS-NEXT: 
[[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; GFX12-LABEL: name: test_load_flat_v2p1_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: 
[[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: 
[[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p1_align1 @@ -15422,124 +12860,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; 
GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: 
[[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX11PLUS-NEXT: 
[[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_flat_v2p3_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; 
GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir index 741f878c86f8b6..6d93112aae1a06 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir @@ -636,27 +636,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT 
i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s16_align1 ; GFX11: liveins: $vgpr0 @@ -702,15 +690,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s16) = G_LOAD %0 :: (load (s16), align 1, addrspace 5) %2:_(s32) = G_ANYEXT %1 @@ -853,27 +853,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s32_align2 ; GFX11: liveins: $vgpr0 @@ -919,15 +907,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -1012,47 +1012,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from 
unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s32_align1 ; GFX11: liveins: $vgpr0 @@ -1118,15 +1086,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR2]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -1529,39 +1529,27 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR1]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; GFX10-LABEL: name: test_load_private_s24_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX10-NEXT: 
[[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR1]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; GFX11-LABEL: name: test_load_private_s24_align1 ; GFX11: liveins: $vgpr0 @@ -1631,27 +1619,39 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], 
[[OR]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR1]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s24_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR1]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s24) = G_LOAD %0 :: (load (s24), 
align 1, addrspace 5) %2:_(s32) = G_ANYEXT %1 @@ -2147,42 +2147,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX10-LABEL: name: test_load_private_s64_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX11-LABEL: name: test_load_private_s64_align2 @@ -2245,15 +2225,51 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; 
UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s64_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 
2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) %0:_(p5) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load (s64), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -2386,78 +2402,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: 
[[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: 
[[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX10-LABEL: name: test_load_private_s64_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - 
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = 
G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX11-LABEL: name: test_load_private_s64_align1 @@ -2556,15 +2516,87 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) 
= G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; 
UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s64_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL 
[[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) %0:_(p5) = COPY $vgpr0 
%1:_(s64) = G_LOAD %0 :: (load (s64), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -2742,53 +2774,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -2796,53 +2789,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], 
[[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: 
[[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -2974,16 +2928,108 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: 
[[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) 
+ ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = 
G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 5) @@ -3381,28 +3427,14 @@ body: | ; GFX9: liveins: $vgpr0 ; 
GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x 
s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3410,28 +3442,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; 
GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3513,16 +3531,58 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 2, addrspace 5) @@ -3701,53 +3761,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3755,53 +3776,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], 
[[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR 
[[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3933,16 +3915,108 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + 
; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from 
unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from 
unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 5) @@ -4166,68 +4240,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], 
[[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, 
addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4235,68 +4258,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; 
GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4458,16 +4430,138 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: 
[[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR 
[[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; 
UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: 
(load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], 
[[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 5) @@ -4928,35 +5022,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], 
[[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: 
[[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4964,35 +5040,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5088,16 +5146,72 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; 
UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: 
[[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 2, addrspace 5) @@ -5321,68 +5435,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - 
; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; 
GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5390,68 +5453,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; 
GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY 
[[BITCAST]](s128) ; @@ -5613,16 +5625,138 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = 
G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; 
UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 5) @@ -5932,42 +6066,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: 
[[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX10-LABEL: name: test_load_private_p1_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from 
unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX11-LABEL: name: test_load_private_p1_align2 @@ -6030,15 +6144,53 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p1_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 
4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) %0:_(p5) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load (p1), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -6171,78 +6323,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: 
[[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; 
GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX10-LABEL: name: test_load_private_p1_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: 
[[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX11-LABEL: name: test_load_private_p1_align1 @@ -6341,15 +6437,89 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 
5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p1_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) %0:_(p5) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load (p1), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -6494,29 +6664,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX10-LABEL: name: test_load_private_p3_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX11-LABEL: name: test_load_private_p3_align2 ; GFX11: liveins: $vgpr0 @@ -6564,15 +6720,29 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], 
[[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p3_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p5) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load (p3), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -6660,49 +6830,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX10-LABEL: name: test_load_private_p3_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: 
[[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX11-LABEL: name: test_load_private_p3_align1 ; GFX11: liveins: $vgpr0 @@ -6770,15 +6906,49 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p3_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p5) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load (p3), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -6923,29 +7093,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX10-LABEL: name: test_load_private_p5_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11-LABEL: name: test_load_private_p5_align2 ; GFX11: liveins: $vgpr0 @@ -6993,15 +7149,29 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p5_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p5) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load (p5), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -7089,49 +7259,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from 
unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX10-LABEL: name: test_load_private_p5_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: 
[[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11-LABEL: name: test_load_private_p5_align1 ; GFX11: liveins: $vgpr0 @@ -7199,15 +7335,49 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p5_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p5) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load (p5), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -7357,30 +7527,20 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR 
[[OR]](s32), [[LSHR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s8_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s8_align1 @@ -7437,20 +7597,30 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; 
UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s8_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s8>) = G_LOAD %0 :: (load (<2 x s8>), align 1, addrspace 5) @@ -7938,81 +8108,71 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = 
G_ZEXT [[OR2]](s16) - ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX9-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX10-LABEL: name: test_load_private_v3s8_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) 
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; 
GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX10-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX11-LABEL: name: test_load_private_v3s8_align1 ; GFX11: liveins: $vgpr0 @@ -8168,71 +8328,81 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from 
unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) - ; UNALIGNED_GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) ; UNALIGNED_GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; UNALIGNED_GFX11-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; UNALIGNED_GFX11-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) - ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s16) = 
G_OR [[AND]], [[SHL1]] + ; UNALIGNED_GFX11-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] + ; UNALIGNED_GFX11-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) - ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] - ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) - ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR4]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v3s8_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) - ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) ; UNALIGNED_GFX12-NEXT: 
[[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) - ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] + ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] + ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) - ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] - ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) - ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] - ; 
UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s8>) = G_LOAD %0 :: (load (<3 x s8>), align 1, addrspace 5) %2:_(s24) = G_BITCAST %1 @@ -8658,136 +8828,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: 
[[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; 
GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v16s8_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; 
GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v16s8_align16 @@ -8944,15 +9012,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY 
[[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + 
; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v16s8_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; 
UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; 
UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<16 x s8>) = G_LOAD %0 :: (load (<16 x s8>), align 1, addrspace 5) %2:_(<4 x s32>) = G_BITCAST %1 @@ -9107,27 +9297,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX10-LABEL: name: test_load_private_v2s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11-LABEL: name: test_load_private_v2s16_align2 ; GFX11: liveins: $vgpr0 @@ -9173,15 +9351,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s16_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) - ; 
UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -9278,47 +9468,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX10-LABEL: name: test_load_private_v2s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = 
G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11-LABEL: name: test_load_private_v2s16_align1 ; GFX11: liveins: $vgpr0 @@ -9384,15 +9542,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -9824,27 +10014,26 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -9853,27 +10042,26 @@ body: | ; GFX10: liveins: $vgpr0 ; 
GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], 
[[C2]](s32) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10215,41 +10403,26 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 
(s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10258,41 +10431,26 @@ 
body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10-NEXT: 
[[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10445,22 +10603,36 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; UNALIGNED_GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; UNALIGNED_GFX11-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; 
UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) ; UNALIGNED_GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; UNALIGNED_GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -10474,22 +10646,36 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; UNALIGNED_GFX12-NEXT: 
[[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) ; UNALIGNED_GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; UNALIGNED_GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -10827,44 +11013,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9-NEXT: 
[[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX10-LABEL: name: test_load_private_v4s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = 
G_TRUNC [[LOAD2]](s32) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX11-LABEL: name: test_load_private_v4s16_align2 @@ -10929,15 +11093,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = 
G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s16_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11091,80 +11287,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR 
[[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX10-LABEL: name: test_load_private_v4s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from 
unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 1, 
addrspace 5) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX11-LABEL: name: test_load_private_v4s16_align1 @@ -11265,15 +11403,83 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: 
[[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; 
UNALIGNED_GFX12-LABEL: name: test_load_private_v4s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11582,42 +11788,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = 
COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; 
GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s32_align2 @@ -11680,15 +11866,43 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11821,78 +12035,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address 
+ 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, 
addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; 
GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY 
[[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s32_align1 @@ -11991,15 +12149,79 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -12174,106 +12396,28 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 
5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; 
GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; 
GFX10-LABEL: name: test_load_private_v3s32_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) 
from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: 
(load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; GFX11-LABEL: name: test_load_private_v3s32_align16 @@ -12400,15 +12544,107 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v3s32_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; 
UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + 
; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 
5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -12764,136 +13000,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: 
[[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: 
[[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: 
(load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = 
G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align16 @@ -13050,15 +13184,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: 
[[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) 
+ ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR 
[[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + 
; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) 
:: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], 
[[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -13493,70 +13749,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: 
[[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), 
[[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align2 @@ -13647,15 +13867,71 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 2, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -13875,136 +14151,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from 
unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load 
(s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: 
[[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - 
; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; 
GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align1 @@ -14161,15 +14335,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; 
UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR 
[[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: 
[[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -15262,68 +15558,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load 
(s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; 
GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -15331,68 +15576,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: 
[[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - 
; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -15552,15 +15746,155 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) 
+ ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) 
+ ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL12:%[0-9]+]]:_(s32) 
= G_SHL [[OR11]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) + ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s64_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) + ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -18178,98 +18512,23 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: 
[[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: 
[[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) 
= G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: 
[[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) - ; GFX9-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] - ; GFX9-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) - ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) - ; GFX9-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] - ; GFX9-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) - ; GFX9-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] - ; GFX9-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) - ; GFX9-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] - ; GFX9-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from 
unknown-address + 22, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) - ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) - ; GFX9-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] - ; GFX9-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) - ; GFX9-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 1, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18280,98 +18539,23 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 
(s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR 
[[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x 
s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, 
addrspace 5) - ; GFX10-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) - ; GFX10-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX10-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] - ; GFX10-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) - ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) - ; GFX10-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] - ; GFX10-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) - ; GFX10-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] - ; GFX10-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) - ; GFX10-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX10-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] - ; GFX10-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) - ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) - ; GFX10-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX10-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] - ; GFX10-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) - ; GFX10-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 1, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18616,12 +18800,99 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) 
+ ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] + ; UNALIGNED_GFX11-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] + ; UNALIGNED_GFX11-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -18631,12 +18902,99 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address 
+ 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from 
unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] + ; UNALIGNED_GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], 
[[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] + ; UNALIGNED_GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -18818,49 +19176,23 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] - ; GFX9-NEXT: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18871,49 +19203,23 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; 
GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; 
GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: 
[[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -19060,12 +19366,50 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR 
[[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -19075,12 +19419,50 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; 
UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: 
[[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], 
[[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index ea10547da6ab7f..3fc5d0d4b279eb 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -475,8 +475,14 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dword v0, off, s0 -; FLATSCR-NEXT: scratch_load_dword v1, off, s0 offset:2 +; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 offset:2 +; FLATSCR-NEXT: scratch_load_ushort v3, off, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 +; FLATSCR-NEXT: s_mov_b32 s0, 
0x5040100 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_perm_b32 v0, v0, v3, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR-NEXT: s_endpgm @@ -537,8 +543,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:4 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; FLATSCR_GFX10-NEXT: s_clause 0x1 -; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0 -; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, s0 offset:2 +; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 offset:2 +; FLATSCR_GFX10-NEXT: scratch_load_ushort v3, off, s0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(1) +; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v1, v0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR_GFX10-NEXT: s_endpgm @@ -561,8 +572,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v0, off, off -; GFX11-NEXT: scratch_load_b32 v1, off, off offset:2 +; GFX11-NEXT: scratch_load_u16 v0, off, off offset:2 +; GFX11-NEXT: scratch_load_u16 v3, off, off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll index 0ad53083d0ff3f..12593e3760fd3e 100644 --- 
a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -123,10 +123,8 @@ define amdgpu_kernel void @zextload_flat_i16(ptr addrspace(1) noalias %out, ptr } ; GCN-LABEL: flat_scratch_unaligned_load: -; GCN: flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} +; GFX9: flat_load_dword +; GFX10PLUS: flat_load_{{dword|b32}} define amdgpu_kernel void @flat_scratch_unaligned_load() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast ptr addrspace(5) %scratch to ptr @@ -136,10 +134,8 @@ define amdgpu_kernel void @flat_scratch_unaligned_load() { } ; GCN-LABEL: flat_scratch_unaligned_store: -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} +; GFX9: flat_store_dword +; GFX10PLUS: flat_store_{{dword|b32}} define amdgpu_kernel void @flat_scratch_unaligned_store() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast ptr addrspace(5) %scratch to ptr diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index 1dd18b4228fe5e..9d43efbdf07b1f 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -16,47 +16,18 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v9, s7 ; CHECK-NEXT: v_mov_b32_e32 v8, s6 -; CHECK-NEXT: flat_load_ubyte v10, v[8:9] offset:5 -; CHECK-NEXT: flat_load_ubyte v11, v[8:9] offset:6 -; CHECK-NEXT: flat_load_ubyte v12, v[8:9] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[8:9] offset:3 -; CHECK-NEXT: flat_load_ubyte v14, v[8:9] offset:2 -; CHECK-NEXT: flat_load_ubyte v15, v[8:9] offset:1 -; CHECK-NEXT: flat_load_ubyte v16, v[8:9] -; CHECK-NEXT: flat_load_ubyte v17, v[8:9] offset:4 -; CHECK-NEXT: flat_load_ubyte v18, v[8:9] offset:13 -; CHECK-NEXT: 
flat_load_ubyte v19, v[8:9] offset:14 -; CHECK-NEXT: flat_load_ubyte v20, v[8:9] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[8:9] offset:11 -; CHECK-NEXT: flat_load_ubyte v22, v[8:9] offset:10 -; CHECK-NEXT: flat_load_ubyte v23, v[8:9] offset:9 -; CHECK-NEXT: flat_load_ubyte v24, v[8:9] offset:8 -; CHECK-NEXT: flat_load_ubyte v25, v[8:9] offset:12 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s6, v6 +; CHECK-NEXT: v_mov_b32_e32 v13, s7 +; CHECK-NEXT: v_add_co_u32_e32 v12, vcc, s6, v6 ; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[4:5], 2 -; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v7, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v7, vcc ; CHECK-NEXT: s_add_u32 s6, s6, 16 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[8:9], v13 offset:3 -; CHECK-NEXT: flat_store_byte v[8:9], v14 offset:2 -; CHECK-NEXT: flat_store_byte v[8:9], v15 offset:1 -; CHECK-NEXT: flat_store_byte v[8:9], v16 -; CHECK-NEXT: flat_store_byte v[8:9], v12 offset:7 -; CHECK-NEXT: flat_store_byte v[8:9], v11 offset:6 -; CHECK-NEXT: flat_store_byte v[8:9], v10 offset:5 -; CHECK-NEXT: flat_store_byte v[8:9], v17 offset:4 -; CHECK-NEXT: flat_store_byte v[8:9], v21 offset:11 -; CHECK-NEXT: flat_store_byte v[8:9], v22 offset:10 -; CHECK-NEXT: flat_store_byte v[8:9], v23 offset:9 -; CHECK-NEXT: flat_store_byte v[8:9], v24 offset:8 -; CHECK-NEXT: flat_store_byte v[8:9], v20 offset:15 -; CHECK-NEXT: flat_store_byte v[8:9], v19 offset:14 -; CHECK-NEXT: flat_store_byte v[8:9], v18 offset:13 -; CHECK-NEXT: flat_store_byte v[8:9], v25 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; CHECK-NEXT: s_cbranch_vccz .LBB0_2 ; CHECK-NEXT: ; %bb.3: ; %loop-memcpy-residual-header ; CHECK-NEXT: s_mov_b32 s4, 0 @@ -128,47 +99,18 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; 
CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v10, s10 ; CHECK-NEXT: v_mov_b32_e32 v11, s11 -; CHECK-NEXT: flat_load_ubyte v12, v[10:11] offset:5 -; CHECK-NEXT: flat_load_ubyte v13, v[10:11] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[10:11] offset:7 -; CHECK-NEXT: flat_load_ubyte v15, v[10:11] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[10:11] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[10:11] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[10:11] -; CHECK-NEXT: flat_load_ubyte v19, v[10:11] offset:4 -; CHECK-NEXT: flat_load_ubyte v20, v[10:11] offset:13 -; CHECK-NEXT: flat_load_ubyte v21, v[10:11] offset:14 -; CHECK-NEXT: flat_load_ubyte v22, v[10:11] offset:15 -; CHECK-NEXT: flat_load_ubyte v23, v[10:11] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[10:11] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[10:11] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[10:11] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[10:11] offset:12 +; CHECK-NEXT: flat_load_dwordx4 v[10:13], v[10:11] +; CHECK-NEXT: v_mov_b32_e32 v15, s11 ; CHECK-NEXT: s_add_u32 s14, s14, 1 -; CHECK-NEXT: v_add_co_u32_e32 v10, vcc, s10, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v3, vcc +; CHECK-NEXT: v_add_co_u32_e32 v14, vcc, s10, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc ; CHECK-NEXT: s_addc_u32 s15, s15, 0 ; CHECK-NEXT: s_add_u32 s10, s10, 16 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[14:15], v[4:5] ; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[10:11], v15 offset:3 -; CHECK-NEXT: flat_store_byte v[10:11], v16 offset:2 -; CHECK-NEXT: flat_store_byte v[10:11], v17 offset:1 -; CHECK-NEXT: flat_store_byte v[10:11], v18 -; CHECK-NEXT: flat_store_byte v[10:11], v14 offset:7 -; CHECK-NEXT: flat_store_byte v[10:11], v13 offset:6 -; CHECK-NEXT: flat_store_byte v[10:11], v12 offset:5 -; CHECK-NEXT: flat_store_byte v[10:11], v19 offset:4 
-; CHECK-NEXT: flat_store_byte v[10:11], v23 offset:11 -; CHECK-NEXT: flat_store_byte v[10:11], v24 offset:10 -; CHECK-NEXT: flat_store_byte v[10:11], v25 offset:9 -; CHECK-NEXT: flat_store_byte v[10:11], v26 offset:8 -; CHECK-NEXT: flat_store_byte v[10:11], v22 offset:15 -; CHECK-NEXT: flat_store_byte v[10:11], v21 offset:14 -; CHECK-NEXT: flat_store_byte v[10:11], v20 offset:13 -; CHECK-NEXT: flat_store_byte v[10:11], v27 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] ; CHECK-NEXT: s_cbranch_execnz .LBB0_14 ; CHECK-NEXT: .LBB0_15: ; %Flow20 @@ -251,23 +193,11 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: .LBB1_8: ; %post-loop-memcpy-expansion ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 +; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v5, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v2 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: .LBB1_9: ; %loop-memcpy-expansion2 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccz .LBB1_9 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll 
b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 0a76e169e9c385..8c28fac0d839c2 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -10,108 +10,21 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v12, s3 +; CHECK-NEXT: v_mov_b32_e32 v11, s2 +; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 +; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 +; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] +; CHECK-NEXT: v_mov_b32_e32 v12, s1 +; CHECK-NEXT: v_mov_b32_e32 v11, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3 -; CHECK-NEXT: 
flat_store_byte v[2:3], v8 offset:4 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21 -; CHECK-NEXT: 
flat_store_byte v[2:3], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16 -; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33 -; CHECK-NEXT: flat_store_byte v[2:3], v18 
offset:32 -; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31 +; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 +; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 +; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -185,375 +98,59 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 +; CHECK-NEXT: v_mov_b32_e32 v25, s2 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 +; 
CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 +; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen -; 
CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 -; CHECK-NEXT: s_waitcnt vmcnt(30) 
-; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, 
s[16:19], 0 offen offset:47 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 -; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: 
buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, 
s[16:19], 0 offen offset:50 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 -; CHECK-NEXT: 
global_load_ubyte v12, v0, s[0:1] offset:89 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 +; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 +; CHECK-NEXT: 
buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte 
v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 -; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; 
CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) 
%0, i64 128, i1 false) @@ -569,363 +166,57 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 +; CHECK-NEXT: v_mov_b32_e32 v26, s0 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, 
v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, 
s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 
offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 
offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 -; CHECK-NEXT: s_nop 0 
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 
offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 
offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: 
buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v25, s1 +; CHECK-NEXT: v_mov_b32_e32 v24, s0 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 +; 
CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 +; CHECK-NEXT: 
buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ -972,279 +263,27 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-LABEL: memcpy_p0_p3_minsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:114 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:115 -; CHECK-NEXT: 
ds_read_u8 v7, v2 offset:116 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:121 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:122 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:123 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:124 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:125 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:126 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:127 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:96 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:97 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:98 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:100 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:101 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:102 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:103 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97 
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:104 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:107 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:108 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:109 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:110 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:111 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:81 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:82 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:83 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:84 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:85 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:86 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:87 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:88 -; CHECK-NEXT: ds_read_u8 
v4, v2 offset:89 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:90 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:91 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:92 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:93 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:94 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:95 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:64 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:67 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:68 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:69 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:70 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:71 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:72 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:73 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:74 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:75 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:76 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:77 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:78 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:79 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 -; CHECK-NEXT: flat_store_byte v[0:1], 
v4 offset:73 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:48 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:49 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:50 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:51 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:52 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:53 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:54 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:55 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:56 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:57 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:58 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:60 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:61 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:62 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:63 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:32 -; CHECK-NEXT: ds_read_u8 v4, v2 
offset:33 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:34 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:35 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:36 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:37 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:38 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:39 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:41 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:42 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:43 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:44 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:45 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:46 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:47 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47 -; CHECK-NEXT: ds_read_u8 v3, v2 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:10 -; CHECK-NEXT: 
ds_read_u8 v14, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v3 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 -; CHECK-NEXT: 
flat_store_byte v[0:1], v6 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: v_mov_b32_e32 v16, 0 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 +; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v21, s1 +; CHECK-NEXT: v_mov_b32_e32 v20, s0 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 +; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[16:19] offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) @@ -1256,108 +295,21 @@ define amdgpu_kernel void 
@memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v12, s3 +; CHECK-NEXT: v_mov_b32_e32 v11, s2 +; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 +; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 +; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] +; CHECK-NEXT: v_mov_b32_e32 v12, s1 +; CHECK-NEXT: v_mov_b32_e32 v11, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:4 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8 
-; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16 -; 
CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32 -; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31 +; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 +; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 +; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 +; CHECK-NEXT: 
flat_store_dwordx4 v[11:12], v[0:3] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -1431,375 +383,59 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], 
v24, s[0:1] offset:32 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 +; CHECK-NEXT: v_mov_b32_e32 v25, s2 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; 
CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 +; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, 
s[16:19], 0 offen offset:4 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 -; CHECK-NEXT: 
s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 -; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 -; 
CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; 
CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 -; CHECK-NEXT: 
s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: 
buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 +; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte 
v12, v1, s[16:19], 0 offen offset:89 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: 
buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 -; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 
offen offset:96 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -1815,363 +451,57 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: 
v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 +; CHECK-NEXT: v_mov_b32_e32 v26, s0 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: 
v_mov_b32_e32 v1, s1 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen 
offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 -; CHECK-NEXT: s_nop 0 -; 
CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 
0 offen offset:54 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; 
CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 -; CHECK-NEXT: s_nop 0 -; 
CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 -; CHECK-NEXT: s_nop 0 -; 
CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 -; CHECK-NEXT: 
flat_store_byte v[0:1], v11 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v25, s1 +; CHECK-NEXT: v_mov_b32_e32 v24, s0 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104 -; 
CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen 
offset:12 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ -2218,279 +548,27 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-LABEL: memcpy_p0_p3_optsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:114 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:115 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:116 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 
v1, s1 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:121 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:122 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:123 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:124 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:125 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:126 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:127 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:96 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:97 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:98 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:100 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:101 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:102 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:103 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102 
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:104 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:107 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:108 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:109 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:110 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:111 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:81 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:82 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:83 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:84 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:85 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:86 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:87 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:88 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:89 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:90 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:91 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:92 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:93 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:94 -; CHECK-NEXT: ds_read_u8 v10, 
v2 offset:95 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:64 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:67 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:68 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:69 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:70 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:71 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:72 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:73 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:74 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:75 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:76 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:77 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:78 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:79 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 -; CHECK-NEXT: flat_store_byte v[0:1], v9 
offset:78 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:48 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:49 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:50 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:51 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:52 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:53 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:54 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:55 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:56 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:57 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:58 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:60 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:61 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:62 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:63 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:32 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:33 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:34 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:35 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:36 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:37 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:38 -; CHECK-NEXT: ds_read_u8 v10, v2 
offset:39 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:41 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:42 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:43 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:44 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:45 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:46 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:47 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47 -; CHECK-NEXT: ds_read_u8 v3, v2 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: 
ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v3 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: flat_store_byte 
v[0:1], v11 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: v_mov_b32_e32 v16, 0 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 +; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v21, s1 +; CHECK-NEXT: v_mov_b32_e32 v20, s0 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 +; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[16:19] offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll index 7575782c1b2acd..cadc3dadb0a1e9 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll @@ -13,55 +13,9 @@ define void 
@memcpy_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: 
flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -73,101 +27,19 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) 
lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:19 -; 
CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: 
flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -179,104 +51,13 @@ define void @memcpy_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: 
s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:17 -; CHECK-NEXT: 
flat_load_ubyte v2, v[2:3] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; 
CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -288,31 +69,9 @@ define void @memcpy_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -324,55 +83,19 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] 
offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -384,55 +107,13 @@ define void @memcpy_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; 
CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 
v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -458,58 +139,13 @@ define void @memcpy_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(16) -; 
CHECK-NEXT: flat_store_byte v[0:1], v12 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -553,58 +189,13 @@ define void @memcpy_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte 
v11, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:30 -; 
CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -634,55 +225,9 @@ define void @memcpy_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: 
s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -694,101 +239,19 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off 
offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, 
v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; 
CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -800,104 +263,13 @@ define void @memcpy_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], 
off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 
-; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], 
v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -909,31 +281,9 @@ define void @memcpy_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -945,55 +295,19 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: 
global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, 
v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1005,55 +319,13 @@ define void @memcpy_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; 
CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1080,35 +352,12 @@ define void @memcpy_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: 
global_load_dwordx4 v[4:7], v[2:3], off -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1153,35 +402,12 @@ define void @memcpy_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: global_load_dwordx4 v[4:7], 
v[2:3], off -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1211,54 +437,9 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; 
CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1270,96 +451,19 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v17, v2 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; 
CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1371,100 +475,12 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 
v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v18, v2 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:31 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:22 -; 
CHECK-NEXT: ds_read_u8 v13, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1476,30 +492,9 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr 
; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1511,54 +506,19 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 
-; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) 
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1570,54 +530,12 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; 
CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1643,35 +561,12 @@ define void @memcpy_p0_p3_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1 -; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:15 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi 
v[0:1], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1714,35 +609,12 @@ define void @memcpy_p0_p3_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p3_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_b128 v[3:6], v2 -; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[7:10], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 
-; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1771,55 +643,12 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: 
global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1831,100 +660,24 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr 
addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:2 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:3 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:4 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:5 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:6 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:7 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:9 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:10 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:11 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 
offset:12 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: 
s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24 +; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 +; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1936,104 +689,18 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; 
CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: 
flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) 
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2045,30 +712,12 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_ushort v4, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:2 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:4 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short 
v[0:1], v4 offset:6 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:8 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:10 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:10 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:12 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 offset:14 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2080,55 +729,24 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off 
offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: 
flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24 +; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 +; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2140,55 +758,18 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short 
v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2219,30 +800,7 @@ define void @memcpy_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi 
v[0:1], v2 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v3 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2290,30 +848,7 @@ define void @memcpy_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v3 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 
-; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2342,55 +877,13 @@ define void @memcpy_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, 
v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: 
@@ -2402,99 +895,23 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:9 -; CHECK-NEXT: 
s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v17 -; CHECK-NEXT: s_clause 0xc -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt 
vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2506,103 +923,19 @@ define void @memcpy_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; 
CHECK-LABEL: memcpy_p0_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; 
CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xd -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; 
CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2614,31 +947,13 @@ define void @memcpy_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2650,55 +965,23 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; 
CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: 
buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2710,55 +993,19 @@ define void @memcpy_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: 
buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: 
buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2788,53 +1035,19 @@ define void @memcpy_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen 
offset:30 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:27 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:15 ; 
CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2888,53 +1101,19 @@ define void @memcpy_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:16 -; CHECK-NEXT: 
s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:27 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2970,41 +1149,8 @@ define void @memcpy_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] 
offset:3 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v18, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3016,79 +1162,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:26 
-; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v6, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v5, v7, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v7, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v18 -; CHECK-NEXT: 
s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v21, 8, v22 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v14, v23, 8, v24 -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v30 -; CHECK-NEXT: v_lshl_or_b32 v4, v14, 16, v12 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v2, v33, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v3, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v2, v2, 16, v18 ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3100,79 +1182,13 @@ define void @memcpy_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] 
offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v3 -; CHECK-NEXT: s_waitcnt 
vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v9, v12, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v28, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v33 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v34, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -3183,23 +1199,8 @@ define void @memcpy_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; 
%entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v6 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v7 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3211,52 +1212,16 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] 
offset:27 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v20, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v21, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v22, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v23, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v7, v4, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v4, v19, 16, v20 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v3, v21, 16, v22 -; CHECK-NEXT: v_lshl_or_b32 v9, v13, 16, v12 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v2, v23, 16, v2 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 
%dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -3267,39 +1232,13 @@ define void @memcpy_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v9, v7, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v3, v12, 16, v13 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v16, 16, v17 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v18, 16, v19 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -3323,47 +1262,13 @@ define void @memcpy_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v7, 16, v10 -; 
CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v14, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false) @@ -3404,47 +1309,13 @@ define void @memcpy_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p1_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] 
offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v7, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v14, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false) @@ -4042,44 +1913,13 @@ define void @memcpy_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; 
CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v11, v2, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -4090,81 +1930,21 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: 
buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: 
v_lshl_or_b32 v17, v30, 8, v29 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -4175,81 +1955,19 @@ define void @memcpy_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, 
s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: 
buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; 
CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v33 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -4260,24 +1978,13 @@ define void @memcpy_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, 
s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -4288,52 +1995,21 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v19, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v21, v2, s[0:3], 0 
offen offset:16 -; CHECK-NEXT: buffer_load_ushort v22, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v15 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v18, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v10 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v20, 16, v19 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v22, 16, v21 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) 
noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -4344,41 +2020,19 @@ define void @memcpy_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: 
buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 16, v17 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -4406,49 +2060,18 @@ define void @memcpy_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v10, v2, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v20, 8, v19 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] 
entry: @@ -4500,49 +2123,18 @@ define void @memcpy_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p1_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v10, v2, 16, v7 -; 
CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v20, 8, v19 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4577,41 +2169,8 @@ define void @memcpy_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:11 -; CHECK-NEXT: 
flat_load_ubyte v1, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 8, v16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4624,80 +2183,16 @@ define void @memcpy_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] 
offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v8 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v22, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v11, v24, 8, v25 -; 
CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v28, 8, v29 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v32, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v10, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23 -; CHECK-NEXT: ds_write_b64 v0, v[3:4] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4709,79 +2204,13 @@ define void @memcpy_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte 
v10, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) 
lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v19, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v21, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v33, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4793,23 +2222,8 @@ define void @memcpy_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:12 -; CHECK-NEXT: 
flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v1, v8, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v5 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v6 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4822,51 +2236,16 @@ define void @memcpy_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v19, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v20, v[1:2] offset:4 -; 
CHECK-NEXT: flat_load_ushort v21, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v22, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v1, v10, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v11, v5, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v17, 16, v18 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v20 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v5, v21, 16, v22 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4878,40 +2257,13 @@ define void @memcpy_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: 
flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v13, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v18 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4937,47 +2289,13 @@ define void @memcpy_p3_p0_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v7, v6, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v14, v18, 8, v19 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: 
flat_load_dwordx4 v[3:6], v[1:2] +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b128 v0, v[5:8] offset:15 +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5021,47 +2339,13 @@ define void @memcpy_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p3_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; 
CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v7, v6, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v14, v18, 8, v19 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_write_b128 v0, v[1:4] -; CHECK-NEXT: ds_write_b128 v0, v[5:8] offset:15 +; CHECK-NEXT: ds_write_b128 v0, v[3:6] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write_b128 v0, v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5701,44 +2985,13 @@ define void @memcpy_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: 
buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v14 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v1, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5750,81 +3003,21 @@ define void @memcpy_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 
offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; 
CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[8:9] offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5836,81 +3029,19 @@ define void @memcpy_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; 
CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: 
s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5922,24 +3053,13 @@ define void @memcpy_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5951,52 +3071,21 @@ define void @memcpy_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen 
offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v18, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v20, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v21, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v10, v15, 8, v14 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen 
offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v17, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v5, v21, 16, v20 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v10 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[8:9] offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6008,41 +3097,19 @@ define void @memcpy_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30 -; 
CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6072,50 +3139,19 @@ define void @memcpy_p3_p5_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: 
buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v1, v9, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v11, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v18 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; 
CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v1, v13, 16, v12 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset1:1 -; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:15 +; CHECK-NEXT: ds_write_b128 v0, v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6169,49 +3205,18 @@ define void @memcpy_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p3_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte 
v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v1, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v11, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v9, v1, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v18 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[2:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[6:9] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -6248,55 +3253,12 @@ define void @memcpy_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr 
addr ; CHECK-LABEL: memcpy_p5_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 
-; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6307,101 +3269,24 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:6 
-; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen -; CHECK-NEXT: 
s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; 
CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6412,104 +3297,19 @@ define void @memcpy_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, 
v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte 
v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, 
s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void 
@llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6520,31 +3320,12 @@ define void @memcpy_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 
; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6555,55 +3336,24 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: 
s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void 
@llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6614,55 +3364,19 @@ define void @memcpy_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: 
buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -6689,61 +3403,19 @@ define void @memcpy_p5_p0_sz31_align_8_8(ptr 
addrspace(5) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; 
CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, 
ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false) @@ -6793,61 +3465,19 @@ define void @memcpy_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p5_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 
0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 
offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false) @@ -6881,55 +3511,12 @@ define void @memcpy_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen 
offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6940,207 +3527,47 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:3 
-; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen 
offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; 
CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false) ret void -} - -define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) { -; CHECK-LABEL: 
memcpy_p5_p1_sz32_align_1_1: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:17 -; CHECK-NEXT: 
global_load_ubyte v1, v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt 
vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:18 +} + +define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) { +; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v8, v0, 
s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7151,31 +3578,12 @@ define void @memcpy_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen 
offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7186,55 +3594,24 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: 
s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: 
buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7245,55 +3622,19 @@ define void @memcpy_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen 
offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr 
addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7329,30 +3670,10 @@ define void @memcpy_p5_p1_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, 
s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false) @@ -7411,30 +3732,10 @@ define void @memcpy_p5_p1_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 
+; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false) @@ -7468,54 +3769,12 @@ define void @memcpy_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; 
CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7526,85 +3785,25 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:5 -; 
CHECK-NEXT: ds_read_u8 v8, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen 
offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: 
ds_read_b64 v[6:7], v1 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7615,79 +3814,18 @@ define void @memcpy_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v10, v1 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 
offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:31 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte 
v20, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false) @@ 
-7698,30 +3836,12 @@ define void @memcpy_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v1, v1 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7732,54 +3852,25 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; 
CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: ds_read_b32 
v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7790,54 +3881,18 @@ define void @memcpy_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; 
CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 
offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7872,30 +3927,10 @@ define void @memcpy_p5_p3_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; CHECK-NEXT: 
v_lshrrev_b32_e32 v6, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false) @@ -7952,30 +3987,10 @@ define void @memcpy_p5_p3_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; 
CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false) @@ -8008,55 +4023,12 @@ define void @memcpy_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: 
global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, 
v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8067,100 +4039,24 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:27 
-; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte 
v15, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen 
offset:17 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8171,103 +4067,19 @@ define void @memcpy_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, 
v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen 
offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: 
buffer_store_byte v31, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8278,31 +4090,12 @@ define void @memcpy_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; 
CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8313,55 +4106,24 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off 
offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: 
s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8372,55 +4134,19 @@ define void @memcpy_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: 
global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: 
s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false) @@ -8456,30 +4182,10 @@ define void @memcpy_p5_p4_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; 
CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false) @@ -8538,30 +4244,10 @@ define void @memcpy_p5_p4_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, 
v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false) @@ -8595,55 +4281,19 @@ define void @memcpy_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; 
CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword 
v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8654,99 +4304,34 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:3 -; 
CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xc -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 
-; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen 
offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:26 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:25 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:21 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8757,103 +4342,31 @@ define void @memcpy_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: 
buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xd -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v2, v0, 
s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v13, 
v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8864,31 +4377,19 @@ define void @memcpy_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: 
buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8899,55 +4400,34 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, 
s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword 
v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8958,55 +4438,31 @@ define void @memcpy_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; 
CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; 
CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -9040,67 +4496,31 @@ define void @memcpy_p5_p5_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: 
buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt 
vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:29 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: 
buffer_store_dword v7, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false) @@ -9169,67 +4589,31 @@ define void @memcpy_p5_p5_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p5_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: 
buffer_load_dword v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(7) 
-; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:29 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll index cc5256620bfe08..4e5688adcd6bbd 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll @@ -13,55 +13,9 @@ define void @memmove_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; 
CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -73,100 +27,19 @@ define void @memmove_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, 
v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte 
v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: 
flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -178,103 +51,13 @@ define void @memmove_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; 
CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(31) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(31) -; 
CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -286,31 +69,9 @@ define void @memmove_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; 
%entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -322,55 +83,19 @@ define void @memmove_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] 
offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: 
s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -382,55 +107,13 @@ define void @memmove_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: 
s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -559,55 +242,9 
@@ define void @memmove_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; 
CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -619,100 +256,19 @@ define void @memmove_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off 
offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt 
vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], 
v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -724,103 +280,13 @@ define void @memmove_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], 
off offset:4 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v34, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; 
CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -832,31 +298,9 @@ define void @memmove_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: 
global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -868,55 +312,19 @@ define void @memmove_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 
-; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; 
CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -928,55 +336,13 @@ define void @memmove_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: 
flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1105,54 +471,9 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v2, v2 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1164,72 +485,19 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v6, v2 
offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:29 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v26, v2 -; CHECK-NEXT: ds_read_u8 v27, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v28, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v29, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v30, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v31, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v32, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:21 -; 
CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v26 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1241,74 +509,12 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:29 
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:31 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v27, v2 -; CHECK-NEXT: ds_read_u8 v28, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v29, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v30, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v31, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v32, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v33, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:17 -; CHECK-NEXT: 
flat_store_byte v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v27 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1320,30 +526,9 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) 
-; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1355,54 +540,19 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: 
s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1414,54 +564,12 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; 
CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short 
v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1583,55 +691,9 @@ define void @memmove_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 
offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1643,100 +705,19 @@ define void @memmove_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: 
global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte 
v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: 
global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1748,103 +729,13 @@ define void @memmove_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:14 -; 
CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v34, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; 
CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1856,31 +747,9 @@ define void 
@memmove_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1892,55 +761,19 @@ define void @memmove_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; 
CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: 
global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1952,55 +785,13 @@ define void @memmove_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: 
flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2129,55 +920,13 @@ define void @memmove_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen 
offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; 
CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2189,100 +938,23 @@ define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:17 -; 
CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v12 
offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:6 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: 
flat_store_byte v[0:1], v28 offset:5 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:1 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2294,103 +966,19 @@ define void @memmove_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v17, v2, 
s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: 
flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen 
offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2402,31 +990,13 @@ define void @memmove_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; 
CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2438,55 +1008,23 @@ define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; 
CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] 
offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2498,55 +1036,19 @@ define void @memmove_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; 
CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2698,41 +1200,8 @@ define void @memmove_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:5 -; 
CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v18, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2744,79 +1213,18 @@ define void @memmove_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: 
flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshlrev_b16 v12, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(19) 
lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v3, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v16, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v15, v18, 8, v19 -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v8, v14, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v18, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v17, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v6, v15, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v20, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v19, v28, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v4, v18, 16, v17 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v21, v30, 8, v31 +; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v22, v32, 8, v33 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v12, v12, v2 -; CHECK-NEXT: v_lshl_or_b32 v3, v20, 16, v19 -; CHECK-NEXT: v_lshl_or_b32 v2, v22, 16, v21 -; CHECK-NEXT: global_store_byte v[0:1], v13, off offset:30 -; CHECK-NEXT: global_store_short v[0:1], v12, off offset:28 ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: 
global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2828,79 +1236,13 @@ define void @memmove_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, 
v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v9, v12, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v28, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v33 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v34, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 -; 
CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -2911,23 +1253,8 @@ define void @memmove_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v6 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v7 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2939,41 +1266,18 @@ define void @memmove_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v10, 
v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v8, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v7, v12, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v16, 16, v17 +; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v2, v18, 16, v19 -; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v20, off offset:30 ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 
v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2985,39 +1289,13 @@ define void @memmove_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v9, v7, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v3, v12, 16, v13 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v16, 16, v17 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
-; CHECK-NEXT: v_lshl_or_b32 v6, v18, 16, v19 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -3783,44 +2061,13 @@ define void @memmove_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt 
vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v11, v2, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -3831,82 +2078,24 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:3 -; 
CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v2, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshlrev_b16 v4, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v8, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 8, v12 
-; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v15, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v13, v19, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v15, v26, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v14, v24, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v27 -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_or_b32_e32 v18, v4, v32 -; CHECK-NEXT: v_lshl_or_b32 v4, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 16, v17 +; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: 
global_store_byte v[0:1], v33, off offset:30 -; CHECK-NEXT: global_store_short v[0:1], v18, off offset:28 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -3917,81 +2106,19 @@ define void @memmove_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 
offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 +; CHECK-NEXT: 
s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v33 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -4002,24 +2129,13 @@ define void @memmove_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 
0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -4030,43 +2146,24 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: 
buffer_load_ushort v11, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v8, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v7 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: global_store_short v[0:1], v18, off offset:28 +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: 
global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v19, off offset:30 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -4077,41 +2174,19 @@ define void @memmove_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; 
CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 16, v17 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -4258,41 +2333,8 @@ define void @memmove_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:7 
-; CHECK-NEXT: flat_load_ubyte v6, v[1:2] -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 8, v16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4305,82 +2347,20 @@ define void @memmove_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] 
offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshlrev_b16 v11, 8, v11 -; CHECK-NEXT: v_lshl_or_b32 
v4, v3, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v2, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v10, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v21, 8, v22 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v6, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v16, v25, 8, v26 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v3, v14, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v17, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v11, v11, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v10, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v18, 16, v17 -; CHECK-NEXT: ds_write_b8 v0, v12 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v13 offset:24 -; CHECK-NEXT: ds_write_b16 v0, v11 offset:28 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 +; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: ds_write_b32 v0, v7 offset:24 +; 
CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4392,79 +2372,13 @@ define void @memmove_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] 
offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v19, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v21, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v33, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] 
offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4476,23 +2390,8 @@ define void @memmove_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v1, v8, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v5 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v6 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4505,43 +2404,20 @@ define void @memmove_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] 
offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v1, v12, 16, v13 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v16, 16, v17 -; CHECK-NEXT: ds_write_b16 v0, v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) -; CHECK-NEXT: ds_write_b8 v0, v18 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 +; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; 
CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: ds_write_b32 v0, v7 offset:24 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4553,40 +2429,13 @@ define void @memmove_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v13, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) 
lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v18 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5398,44 +3247,13 @@ define void @memmove_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v1, v1, 
s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v14 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v1, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5447,83 +3265,26 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v8, v1, 
s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshlrev_b16 v3, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; 
CHECK-NEXT: v_lshl_or_b32 v7, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v4, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v14, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v16, v2, 16, v1 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v20, 8, v19 -; CHECK-NEXT: v_lshl_or_b32 v1, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v9, v22, 8, v21 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v12, v23, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v28, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v30, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_or_b32_e32 v17, v3, v31 -; CHECK-NEXT: v_lshl_or_b32 v3, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, 
v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v5, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b8 v0, v32 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v16 offset:24 -; CHECK-NEXT: ds_write_b16 v0, v17 offset:28 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5535,81 +3296,19 @@ define void @memmove_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 
offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(6) 
-; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5621,24 +3320,13 @@ define void @memmove_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: 
buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5650,44 +3338,26 @@ define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: 
buffer_load_ushort v11, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v18, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 +; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 +; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: ds_write_b16 v0, v16 offset:28 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: 
ds_write_b8 v0, v17 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v18 offset:24 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5699,41 +3369,19 @@ define void @memmove_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) 
-; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5890,55 +3538,12 @@ define void @memmove_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; 
CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false) @@ -5949,100 +3554,24 @@ define void @memmove_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, 
v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, 
s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte 
v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6053,103 +3582,19 @@ define void @memmove_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte 
v17, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(31) lgkmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: 
buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; 
CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6160,31 +3605,12 @@ define void @memmove_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:4 -; CHECK-NEXT: 
flat_load_ushort v9, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6195,55 +3621,24 @@ define void @memmove_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; 
CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: 
flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6254,55 +3649,19 @@ define void @memmove_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] 
offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, 
s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -6452,55 +3811,12 @@ define void @memmove_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off 
offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; 
CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6511,100 +3827,24 @@ define void @memmove_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:11 -; 
CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt 
vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen 
offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6615,103 +3855,19 @@ define void @memmove_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off 
offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte 
v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: 
s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6722,31 +3878,12 @@ define void @memmove_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt 
vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6757,55 +3894,24 @@ define void @memmove_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; 
CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 
offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6816,55 +3922,19 @@ define void @memmove_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort 
v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen 
offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7009,54 +4079,12 @@ define void @memmove_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: 
buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7067,72 +4095,25 @@ define void @memmove_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; 
CHECK-LABEL: memmove_p5_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:23 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v25, v1 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v29, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v30, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v31, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: buffer_store_dword v8, 
v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7143,74 +4124,18 @@ define void @memmove_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:31 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:23 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:11 
-; CHECK-NEXT: ds_read_u8 v22, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v26, v1 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v29, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v30, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v31, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v32, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: 
buffer_store_byte v21, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7221,30 +4146,12 @@ define void @memmove_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 -; 
CHECK-NEXT: ds_read_u16 v3, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v1, v1 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7255,54 +4162,25 @@ define void @memmove_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: 
ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short 
v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7313,54 +4191,18 @@ define void @memmove_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; 
CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword 
v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7505,55 +4347,12 @@ define void @memmove_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; 
CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7564,100 +4363,24 
@@ define void @memmove_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte 
v32, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, 
s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, 
s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7668,103 +4391,19 @@ define void @memmove_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:8 -; CHECK-NEXT: 
global_load_ubyte v27, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: 
s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, 
s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7775,31 +4414,12 @@ define void @memmove_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen 
offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7810,55 +4430,24 @@ define void @memmove_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: 
buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 
offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7869,55 +4458,19 @@ define void @memmove_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; 
CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false) @@ -8062,55 +4615,19 @@ define void @memmove_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; 
CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8121,100 +4638,34 @@ define void 
@memmove_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: 
buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: 
buffer_store_byte v18, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:9 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:7 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:5 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v6, v0, 
s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8225,103 +4676,31 @@ define void @memmove_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen 
offset:19 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; 
CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: 
buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8332,31 +4711,19 @@ define void @memmove_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry 
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: 
tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8367,55 +4734,34 @@ define void @memmove_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen 
offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v8, v0, 
s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8426,55 +4772,31 @@ define void @memmove_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: 
s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: 
buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll index 94bc6d46b2395b..8ad6a4e534d232 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll @@ -19,8 +19,8 @@ define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrs ; CHECK-NEXT: v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; CHECK-NEXT: v_add_f16_e32 v4, v6, v7 ; CHECK-NEXT: v_add_f16_e32 v2, v3, v2 -; CHECK-NEXT: flat_store_short v[0:1], v4 -; CHECK-NEXT: flat_store_short v[0:1], v2 offset:2 +; CHECK-NEXT: v_pack_b32_f16 v2, v4, v2 +; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll index 4e734d6e0884bc..fc33a274d7b11a 100644 --- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -607,7 +607,14 @@ define amdgpu_kernel void @local_store_align1_v16i8(ptr addrspace(3) %out) #0 { ; MUBUF: buffer_load_ubyte ; MUBUF: buffer_load_ubyte ; 
MUBUF: buffer_load_ubyte -; FLATSCR: scratch_load_dwordx2 +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte define double @private_load_align1_f64(ptr addrspace(5) %in) { %x = load double, ptr addrspace(5) %in, align 1 ret double %x @@ -622,7 +629,14 @@ define double @private_load_align1_f64(ptr addrspace(5) %in) { ; MUBUF: buffer_store_byte ; MUBUF: buffer_store_byte ; MUBUF: buffer_store_byte -; FLATSCR: scratch_store_dwordx2 +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte define void @private_store_align1_f64(ptr addrspace(5) %out, double %x) #0 { store double %x, ptr addrspace(5) %out, align 1 ret void @@ -651,7 +665,10 @@ define void @private_store_align4_f64(ptr addrspace(5) %out, double %x) #0 { ; MUBUF: buffer_load_ushort ; MUBUF: buffer_load_ushort ; MUBUF: buffer_load_ushort -; FLATSCR: scratch_load_dwordx2 +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort define double @private_load_align2_f64(ptr addrspace(5) %in) { %x = load double, ptr addrspace(5) %in, align 2 ret double %x @@ -662,7 +679,10 @@ define double @private_load_align2_f64(ptr addrspace(5) %in) { ; MUBUF: buffer_store_short ; MUBUF: buffer_store_short ; MUBUF: buffer_store_short -; FLATSCR: scratch_store_dwordx2 +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short define void @private_store_align2_f64(ptr addrspace(5) %out, double %x) #0 { store double %x, ptr addrspace(5) %out, align 2 ret void From 65780f4d8e34461e6bd3baf2ff77496f97874b94 Mon Sep 
17 00:00:00 2001 From: Dmitry Polukhin <34227995+dmpolukhin@users.noreply.github.com> Date: Fri, 11 Oct 2024 08:23:35 +0100 Subject: [PATCH 134/177] [C++20][Modules] Allow import for a header unit after #pragma (#111662) Summary: `#pragma` and headers that finish with them shouldn't prevent `import "header_unit.h"` syntax. Test Plan: check-clang --- clang/lib/Lex/Preprocessor.cpp | 4 ++++ .../import_header_unit_after_pragma.cpp | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 clang/test/Headers/import_header_unit_after_pragma.cpp diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index f0b4593e0cc22e..ecc5166d7b814c 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -902,6 +902,10 @@ void Preprocessor::Lex(Token &Result) { case tok::r_brace: StdCXXImportSeqState.handleCloseBrace(); break; +#define PRAGMA_ANNOTATION(X) case tok::annot_##X: +// For `#pragma ...` mimic ';'. +#include "clang/Basic/TokenKinds.def" +#undef PRAGMA_ANNOTATION // This token is injected to represent the translation of '#include "a.h"' // into "import a.h;". Mimic the notional ';'. 
case tok::annot_module_include: diff --git a/clang/test/Headers/import_header_unit_after_pragma.cpp b/clang/test/Headers/import_header_unit_after_pragma.cpp new file mode 100644 index 00000000000000..b1ad3b07fea29c --- /dev/null +++ b/clang/test/Headers/import_header_unit_after_pragma.cpp @@ -0,0 +1,18 @@ +// RUN: rm -fR %t +// RUN: split-file %s %t +// RUN: cd %t +// RUN: %clang_cc1 -verify -std=c++20 -emit-header-unit -xc++-user-header bz0.h +// RUN: %clang_cc1 -verify -std=c++20 -emit-header-unit -xc++-user-header -fmodule-file=bz0.pcm bz.cpp + +//--- compare +#pragma GCC visibility push(default) +#pragma GCC visibility pop + +//--- bz0.h +#include "compare" +// expected-no-diagnostics + +//--- bz.cpp +#include "compare" + +import "bz0.h"; // expected-warning {{the implementation of header units is in an experimental phase}} From ff04bb8f4064274aedcb6e916079132ab6042a10 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 09:31:49 +0200 Subject: [PATCH 135/177] [clang][bytecode] Use PredefinedExpr as base for its variable (#111956) This fixes the error message generated. 
--- clang/lib/AST/ByteCode/Compiler.cpp | 5 +++++ clang/lib/AST/ByteCode/Program.cpp | 15 +++++++++------ clang/lib/AST/ByteCode/Program.h | 3 ++- clang/test/AST/ByteCode/cxx1z.cpp | 4 ++++ 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 0a3b38b0dc6e57..b2663714340b93 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -2869,6 +2869,11 @@ bool Compiler::VisitPredefinedExpr(const PredefinedExpr *E) { if (DiscardResult) return true; + if (!Initializing) { + unsigned StringIndex = P.createGlobalString(E->getFunctionName(), E); + return this->emitGetPtrGlobal(StringIndex, E); + } + return this->delegate(E->getFunctionName()); } diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp index 23245a66b578ae..cd2665f755d7cb 100644 --- a/clang/lib/AST/ByteCode/Program.cpp +++ b/clang/lib/AST/ByteCode/Program.cpp @@ -33,7 +33,7 @@ const void *Program::getNativePointer(unsigned Idx) { return NativePointers[Idx]; } -unsigned Program::createGlobalString(const StringLiteral *S) { +unsigned Program::createGlobalString(const StringLiteral *S, const Expr *Base) { const size_t CharWidth = S->getCharByteWidth(); const size_t BitWidth = CharWidth * Ctx.getCharBit(); @@ -52,12 +52,15 @@ unsigned Program::createGlobalString(const StringLiteral *S) { llvm_unreachable("unsupported character width"); } + if (!Base) + Base = S; + // Create a descriptor for the string. - Descriptor *Desc = - allocateDescriptor(S, CharType, Descriptor::GlobalMD, S->getLength() + 1, - /*isConst=*/true, - /*isTemporary=*/false, - /*isMutable=*/false); + Descriptor *Desc = allocateDescriptor(Base, CharType, Descriptor::GlobalMD, + S->getLength() + 1, + /*isConst=*/true, + /*isTemporary=*/false, + /*isMutable=*/false); // Allocate storage for the string. // The byte length does not include the null terminator. 
diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h index be84c40714a60b..f676672fb7ced5 100644 --- a/clang/lib/AST/ByteCode/Program.h +++ b/clang/lib/AST/ByteCode/Program.h @@ -64,7 +64,8 @@ class Program final { const void *getNativePointer(unsigned Idx); /// Emits a string literal among global data. - unsigned createGlobalString(const StringLiteral *S); + unsigned createGlobalString(const StringLiteral *S, + const Expr *Base = nullptr); /// Returns a pointer to a global. Pointer getPtrGlobal(unsigned Idx) const; diff --git a/clang/test/AST/ByteCode/cxx1z.cpp b/clang/test/AST/ByteCode/cxx1z.cpp index 1a06597fa348fe..57f99235a2b201 100644 --- a/clang/test/AST/ByteCode/cxx1z.cpp +++ b/clang/test/AST/ByteCode/cxx1z.cpp @@ -13,3 +13,7 @@ namespace Temp { char arr[3]; A d; // both-error {{refers to subobject '&arr[1]'}} + +void Func() { + A a; // both-error {{pointer to subobject of predefined '__func__' variable}} +} From bff2b8c06f362b6b4c761fc1d3951da2bddf17de Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Fri, 11 Oct 2024 15:56:39 +0800 Subject: [PATCH 136/177] [mlir][sparse][test] Adjust tests for `LowerSparseOpsToForeach` (#110976) This PR relocates the tests added in #109435 to a new file named `no_lowering.mlir` and adds some new tests. 
--- mlir/test/Dialect/SparseTensor/codegen.mlir | 16 ------ .../Dialect/SparseTensor/no_lowering.mlir | 54 +++++++++++++++++++ 2 files changed, 54 insertions(+), 16 deletions(-) create mode 100644 mlir/test/Dialect/SparseTensor/no_lowering.mlir diff --git a/mlir/test/Dialect/SparseTensor/codegen.mlir b/mlir/test/Dialect/SparseTensor/codegen.mlir index df03d871ba3a3e..af78458f109329 100644 --- a/mlir/test/Dialect/SparseTensor/codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/codegen.mlir @@ -826,19 +826,3 @@ func.func @sparse_new_coo_permute_no(%arg0: !llvm.ptr) -> tensor return %0 : tensor } - -// CHECK-LABEL: func.func @test_tensor_dim_unranked -// CHECK: tensor.dim -func.func @test_tensor_dim_unranked(%arg0: tensor<*xf32>) -> index { - %c = arith.constant 0 : index - %0 = tensor.dim %arg0, %c : tensor<*xf32> - return %0 : index -} - -// CHECK-LABEL: func.func @test_tensor_reshape_unranked -// CHECK: tensor.reshape -func.func @test_tensor_reshape_unranked(%src: tensor<*xf32>, %shape: tensor<1xi32>) -> tensor { - %dst = tensor.reshape %src(%shape) - : (tensor<*xf32>, tensor<1xi32>) -> tensor - return %dst : tensor -} diff --git a/mlir/test/Dialect/SparseTensor/no_lowering.mlir b/mlir/test/Dialect/SparseTensor/no_lowering.mlir new file mode 100644 index 00000000000000..4f21055a13d58a --- /dev/null +++ b/mlir/test/Dialect/SparseTensor/no_lowering.mlir @@ -0,0 +1,54 @@ +// RUN: mlir-opt %s --lower-sparse-ops-to-foreach --split-input-file | FileCheck %s + +// Ensure that we exit gracefully rather than crashing. 
+ +// CHECK-LABEL: func.func @test_tensor_dim_unranked +// CHECK: tensor.dim +func.func @test_tensor_dim_unranked(%arg0: tensor<*xf32>) -> index { + %c = arith.constant 0 : index + %0 = tensor.dim %arg0, %c : tensor<*xf32> + return %0 : index +} + +// ----- + +#SparseVector = #sparse_tensor.encoding<{ + map = (d0) -> (d0 : compressed) +}> + +// CHECK-LABEL: func.func @test_no_constant_dim +// CHECK: tensor.dim +func.func @test_no_constant_dim(%arg0: tensor, %arg1: index) -> index { + %0 = tensor.dim %arg0, %arg1 : tensor + return %0 : index +} + +// ----- + +// CHECK-LABEL: func.func @test_tensor_dim_no_encoding +// CHECK: tensor.dim +func.func @test_tensor_dim_no_encoding(%arg0: tensor) -> index { + %c = arith.constant 0 : index + %0 = tensor.dim %arg0, %c : tensor + return %0 : index +} + +// ----- + +// CHECK-LABEL: func.func @test_tensor_reshape_unranked +// CHECK: tensor.reshape +func.func @test_tensor_reshape_unranked(%src: tensor<*xf32>, %shape: tensor<1xi32>) -> tensor { + %dst = tensor.reshape %src(%shape) + : (tensor<*xf32>, tensor<1xi32>) -> tensor + return %dst : tensor +} + +// ----- + +// CHECK-LABEL: func.func @test_tensor_reshape_no_encoding +// CHECK: tensor.reshape +func.func @test_tensor_reshape_no_encoding(%src: tensor, %shape: tensor<1xi32>) -> tensor { + %dst = tensor.reshape %src(%shape) + : (tensor, tensor<1xi32>) -> tensor + return %dst : tensor +} From 8bb12ca28f7f195aa483fdb5921681ec373564ab Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Fri, 11 Oct 2024 11:17:09 +0300 Subject: [PATCH 137/177] [clang][NFC] Update `cxx_dr_status.html` --- clang/www/cxx_dr_status.html | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 1a67b6103cf43e..6f3cc8247d2e2d 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -17128,11 +17128,11 @@

C++ defect report implementation status

Undesired outcomes with const_cast Not resolved - + 2880 - open + accepted Accessibility check for destructor of incomplete class type - Not resolved + Unknown 2881 @@ -17260,7 +17260,7 @@

C++ defect report implementation status

2901 - review + tentatively ready Unclear semantics for near-match aliased access Not resolved @@ -17408,31 +17408,31 @@

C++ defect report implementation status

2923 - open + tentatively ready Note about infinite loops and execution steps Not resolved 2924 - open + review Undefined behavior during constant evaluation Not resolved - + 2925 - open + NAD Deleting a pointer to an incomplete enumeration type - Not resolved + Unknown 2926 - open + tentatively ready Lookup context for dependent qualified names Not resolved 2927 - open + review Unclear status of translation unit with module keyword Not resolved From bb4696ce3051be820de91c8c98b2649af1680236 Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Fri, 11 Oct 2024 09:39:19 +0100 Subject: [PATCH 138/177] [mlir][linalg] Fix for bias handling for Winograd (#110331) PR makes winograd.output_transform op a destination style op and fixes handing of a pre-existing data in its output argument (i.e. possibly pre-initialized with bias, which was discarded before). --------- Signed-off-by: Dmitriy Smirnov --- .../mlir/Dialect/Linalg/IR/LinalgOps.td | 3 +- .../Linalg/Transforms/WinogradConv2D.cpp | 114 +++++++++--------- .../transform-tile-and-winograd-rewrite.mlir | 51 ++++---- .../Linalg/transform-tile-winograd.mlir | 26 ++-- .../Linalg/winograd-conv2d-rewrite.mlir | 17 +-- 5 files changed, 106 insertions(+), 105 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td index 5b6a90f806bedd..e42fd5d2ce13c1 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td @@ -313,7 +313,7 @@ def Linalg_WinogradInputTransformOp : Linalg_Op<"winograd_input_transform", } def Linalg_WinogradOutputTransformOp : Linalg_Op<"winograd_output_transform", - [AllElementTypesMatch<["value", "output"]>, + [AllElementTypesMatch<["value", "output"]>, DestinationStyleOpInterface, DeclareOpInterfaceMethods scf::ValueVector { + auto context = builder.getContext(); Value tileHIter = ivs[0]; Value tileWIter = ivs[1]; Value NIter = ivs[2]; @@ -740,29 +741,41 @@ Value 
outputTransform(RewriterBase &rewriter, Location loc, Value value, FIter, 2, 3, /*loopNorFIdx=*/4, /*loopCorFIdx=*/5, /*heightIdx=*/0, /*widthIdx=*/1); - TransformMapKeyTy key = {m, r}; - int64_t retRows = 1; - int64_t retCols = 1; - int64_t leftScalarFactor = 1; - int64_t rightScalarFactor = 1; + const TransformMapKeyTy key = {m, r}; + const TransformMatrix &AMatrix = AMatrices.at(key); + const TransformMatrix &ATMatrix = ATMatrices.at(key); + int64_t scalarFactor = (rightTransform ? AMatrix.scalarFactor : 1) * + (leftTransform ? ATMatrix.scalarFactor : 1); + int64_t retCols = rightTransform ? AMatrix.cols : 1; + int64_t retRows = leftTransform ? ATMatrix.rows : 1; + Value matmulRetValue = extractValue; Value zero = builder.create( loc, rewriter.getZeroAttr(elementType)); - if (leftTransform) { - // Get constant transform matrix AT. - auto it = ATMatrices.find(key); - if (it == ATMatrices.end()) - return {}; - const TransformMatrix &ATMatrix = it->second; - leftScalarFactor = ATMatrix.scalarFactor; - retRows = ATMatrix.rows; + auto affineMap = + AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context); + Value heightOffset = + builder.create(loc, affineMap, tileHIter); + Value widthOffset = + builder.create(loc, affineMap, tileWIter); + + Value outInitVal = + extract2DDataFrom4D(builder, loc, args[0], NIter, FIter, heightOffset, + widthOffset, retRows, retCols, + /*loopNorFIdx=*/0, + /*loopCorFIdx=*/3, /*heightIdx=*/1, + /*widthIdx=*/2); + if (leftTransform) { auto matmulType = RankedTensorType::get({retRows, valueW}, elementType); - auto empty = - builder - .create(loc, matmulType.getShape(), elementType) - .getResult(); - auto init = builder.create(loc, zero, empty).getResult(0); + Value init = outInitVal; + if (rightTransform || scalarFactor != 1) { + auto empty = builder + .create(loc, matmulType.getShape(), + elementType) + .getResult(); + init = builder.create(loc, zero, empty).getResult(0); + } Value AT = create2DTransformMatrix(builder, loc, 
ATMatrix, elementType); // Multiply AT x m. @@ -772,21 +785,16 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value, } if (rightTransform) { - // Get constant transform matrix T. - auto it = AMatrices.find(key); - if (it == AMatrices.end()) - return {}; - const TransformMatrix &AMatrix = it->second; - - rightScalarFactor = AMatrix.scalarFactor; auto matmulType = RankedTensorType::get({retRows, AMatrix.cols}, elementType); - retCols = AMatrix.cols; - auto empty = - builder - .create(loc, matmulType.getShape(), elementType) - .getResult(); - auto init = builder.create(loc, zero, empty).getResult(0); + Value init = outInitVal; + if (scalarFactor != 1) { + auto empty = builder + .create(loc, matmulType.getShape(), + elementType) + .getResult(); + init = builder.create(loc, zero, empty).getResult(0); + } Value A = create2DTransformMatrix(builder, loc, AMatrix, elementType); // Multiply y = (AT x m) x A. @@ -795,48 +803,36 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value, matmulRetValue = matmulOp.getResult(0); } - if (leftScalarFactor * rightScalarFactor != 1) { - // Multiply scalar factor. - Value scalarFactor = builder.create( - loc, - FloatAttr::get(elementType, leftScalarFactor * rightScalarFactor)); + if (scalarFactor != 1) { + // Multiply by scalar factor and add outInitVal. 
+ Value scalarFactorValue = builder.create( + loc, FloatAttr::get(elementType, scalarFactor)); auto matmulType = RankedTensorType::get({retRows, retCols}, elementType); - auto init = builder.create(loc, matmulType.getShape(), - elementType); - auto identityAffineMap = rewriter.getMultiDimIdentityMap(2); SmallVector affineMaps = { - AffineMap::get(2, 0, init.getContext()), identityAffineMap}; - auto broadcastedScalar = + AffineMap::get(2, 0, context), identityAffineMap, identityAffineMap}; + + matmulRetValue = rewriter .create( - loc, matmulType, ValueRange{scalarFactor}, ValueRange{init}, - affineMaps, + loc, matmulType, + ValueRange{scalarFactorValue, matmulRetValue}, + ValueRange{outInitVal}, affineMaps, llvm::ArrayRef{ utils::IteratorType::parallel, utils::IteratorType::parallel}, [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) { - nestedBuilder.create(nestedLoc, args[0]); + auto mulf = nestedBuilder.create( + nestedLoc, args[0], args[1]); + auto addf = nestedBuilder.create( + nestedLoc, mulf.getResult(), args[2]); + nestedBuilder.create(nestedLoc, + addf.getResult()); }) .getResult(0); - - matmulRetValue = builder - .create( - loc, matmulType, - ValueRange{broadcastedScalar, matmulRetValue}, - ValueRange{init}) - .getResult(0); } - auto context = builder.getContext(); - auto affineMap = - AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context); - Value heightOffset = - builder.create(loc, affineMap, tileHIter); - Value widthOffset = - builder.create(loc, affineMap, tileWIter); - // Insert (H, W) to (N, H, W, F). 
Value combinedVal = insert2DDataTo4D(builder, loc, matmulRetValue, args[0], NIter, FIter, diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir index c5760acf94a88a..776dc5b748c846 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir @@ -85,31 +85,32 @@ module attributes {transform.with_named_sequence} { // CHECK: scf.yield %[[S9]] // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] // CHECK: %[[COLLAPSED_6:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] +// CHECK: %[[S7:.*]] = tensor.empty() // CHECK: %[[S6:.*]] = linalg.batch_matmul // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] -// CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x8x8x2xf32> -// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S7]]) +// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[ARG2]]) // CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] // CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) // CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG2]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG6]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] // CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) // CHECK: %[[S15:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to 
%[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) // CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S25:.*]] = tensor.extract_slice %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: %[[S16:.*]] = tensor.empty() : tensor<4x6xf32> // CHECK: %[[S17:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S16]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S18:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_8]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S17]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S19:.*]] = tensor.empty() : tensor<4x4xf32> // CHECK: %[[S20:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S19]] : tensor<4x4xf32>) -> tensor<4x4xf32> // CHECK: %[[S21:.*]] = linalg.matmul ins(%[[S18]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[S22:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK: %[[S23:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S22]] : tensor<4x4xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: %[[S23:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]], %[[S21]] : f32, tensor<4x4xf32>) outs(%[[S25]] : tensor<4x4xf32>) { +// CHECK: ^bb0(%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: %[[VAL_90:.*]] = arith.mulf %[[IN1]], %[[IN2]] : f32 +// CHECK: %[[VAL_91:.*]] = arith.addf %[[VAL_90]], %[[OUT]] : f32 +/// CHECK: linalg.yield %[[VAL_91]] : f32 // CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S24:.*]] = linalg.mul ins(%[[S23]], %[[S21]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S22]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[INSERTED_SLICE_9:.*]] = 
tensor.insert_slice %[[S24]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S23]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_9]] // CHECK: scf.yield %[[S15]] // CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) @@ -218,32 +219,33 @@ module attributes {transform.with_named_sequence} { // CHECK: scf.yield %[[S9]] // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] // CHECK: %[[COLLAPSED_7:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] +// CHECK: %[[S7:.*]] = tensor.empty() // CHECK: %[[S6:.*]] = linalg.batch_matmul // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] // CHECK: %[[PADDED_8:.*]] = tensor.pad %[[ARG2]] low[0, 0, 0, 0] high[0, 3, 3, 0] -// CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x12x12x2xf32> -// CHECK: %[[S8:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S7]]) +// CHECK: %[[S8:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[PADDED_8]]) // CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) // CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] // CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) // CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[PADDED_8]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[ARG7]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] // CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) // CHECK: 
%[[S15:.*]] = scf.for %[[ARG10:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) // CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S26:.*]] = tensor.extract_slice %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: %[[S17:.*]] = tensor.empty() : tensor<4x6xf32> // CHECK: %[[S18:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S17]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S19:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_11]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S18]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S20:.*]] = tensor.empty() : tensor<4x4xf32> // CHECK: %[[S21:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> // CHECK: %[[S22:.*]] = linalg.matmul ins(%[[S19]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S21]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[S23:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK: %[[S24:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S23]] : tensor<4x4xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: %[[S24:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]], %[[S22]] : f32, tensor<4x4xf32>) outs(%[[S26]] : tensor<4x4xf32>) { +// CHECK: ^bb0(%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: %[[VAL_104:.*]] = arith.mulf %[[IN1]], %[[IN2]] : f32 +// CHECK: %[[VAL_105:.*]] = arith.addf %[[VAL_104]], %[[OUT]] : f32 +/// CHECK: linalg.yield %[[VAL_105]] : f32 // CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S25:.*]] = linalg.mul ins(%[[S24]], %[[S22]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S23]] : tensor<4x4xf32>) -> 
tensor<4x4xf32> -// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S25]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S24]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_12]] // CHECK: scf.yield %[[S15]] : tensor<2x4x4x2xf32> // CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) @@ -330,16 +332,17 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S6:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[ARG2]]) // CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S15:.*]] = tensor.extract_slice %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] // CHECK: %[[S9:.*]] = tensor.empty() : tensor<4x1xf32> // CHECK: %[[S10:.*]] = linalg.fill ins(%[[CST_3]] : f32) outs(%[[S9]] : tensor<4x1xf32>) -> tensor<4x1xf32> // CHECK: %[[S11:.*]] = linalg.matmul ins(%[[CST_0]], %[[EXTRACTED_SLICE]] : tensor<4x6xf32>, tensor<6x1xf32>) outs(%[[S10]] : tensor<4x1xf32>) -> tensor<4x1xf32> -// CHECK: %[[S12:.*]] = tensor.empty() : tensor<4x1xf32> -// CHECK: %[[S13:.*]] = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S12]] : tensor<4x1xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: %[[S13:.*]] = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[CST]], %[[S11]] : f32, tensor<4x1xf32>) outs(%[[S15]] : tensor<4x1xf32>) { +// CHECK: ^bb0(%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: %[[VAL_57:.*]] = arith.mulf %[[IN1]], %[[IN2]] : f32 +// CHECK: 
%[[VAL_58:.*]] = arith.addf %[[VAL_57]], %[[OUT]] : f32 +/// CHECK: linalg.yield %[[VAL_58]] : f32 // CHECK: } -> tensor<4x1xf32> -// CHECK: %[[S14:.*]] = linalg.mul ins(%[[S13]], %[[S11]] : tensor<4x1xf32>, tensor<4x1xf32>) outs(%[[S12]] : tensor<4x1xf32>) -> tensor<4x1xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] // CHECK: scf.yield %[[S7]] // CHECK: return %[[S6]] diff --git a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir index 21522a2083b463..9598c434aadb8f 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir @@ -279,14 +279,14 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[ARG1]]) -> (tensor<2x8x8x2xf32>) +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] iter_args(%[[ARG6:.*]] = %[[ARG5]]) -> (tensor<2x8x8x2xf32>) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> // CHECK: %[[S3:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) // CHECK: %[[S4:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) // CHECK: %[[S5:.*]] = affine.apply #[[$MAP1]]() // CHECK: %[[S6:.*]] = 
affine.apply #[[$MAP1]]() -// CHECK: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG1]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x?x?x2xf32> +// CHECK: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG6]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x?x?x2xf32> // ----- @@ -321,10 +321,10 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C2_3:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C2_5:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C2_7:.*]] = arith.constant 2 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]] -// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C3]] step %[[C2_5]] -// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C2_7]] +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]] iter_args(%[[ARG9:.*]] = %[[ARG1]]) -> (tensor<3x8x8x5xf32>) +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]] iter_args(%[[ARG10:.*]] = %[[ARG9]]) -> (tensor<3x8x8x5xf32>) +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C3]] step %[[C2_5]] iter_args(%[[ARG11:.*]] = %[[ARG10]]) +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C2_7]] iter_args(%[[ARG12:.*]] = %[[ARG11]]) // CHECK: %[[C3_8:.*]] = arith.constant 3 : index // CHECK: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG6]]) // CHECK: %[[C5_9:.*]] = arith.constant 5 : index @@ -334,7 +334,7 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S8:.*]] = affine.apply #[[$MAP2]](%[[ARG4]]) // CHECK: %[[S9:.*]] = affine.apply #[[$MAP3]]() // CHECK: %[[S10:.*]] = affine.apply #[[$MAP3]]() -// CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG6]], %[[S7]], %[[S8]], %[[ARG8]]] [%[[S5]], 
%[[S9]], %[[S10]], %[[S6]]] [1, 1, 1, 1] : tensor<3x8x8x5xf32> to tensor +// CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG12]][%[[ARG6]], %[[S7]], %[[S8]], %[[ARG8]]] [%[[S5]], %[[S9]], %[[S10]], %[[S6]]] [1, 1, 1, 1] : tensor<3x8x8x5xf32> to tensor // ----- @@ -367,14 +367,14 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C1_1]] step %[[C1_2]] -// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C3]] step %[[C1_4]] -// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_5]] to %[[C5]] step %[[C1_6]] +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[ARG1]]) -> (tensor<3x8x1x5xf32>) +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C1_1]] step %[[C1_2]] iter_args(%[[ARG10:.*]] = %[[ARG9]]) -> (tensor<3x8x1x5xf32>) +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C3]] step %[[C1_4]] iter_args(%[[ARG11:.*]] = %[[ARG10]]) -> (tensor<3x8x1x5xf32>) +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_5]] to %[[C5]] step %[[C1_6]] iter_args(%[[ARG12:.*]] = %[[ARG11]]) -> (tensor<3x8x1x5xf32>) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x2x1x3x5xf32> to tensor<6x1x1x1x1x1xf32> // CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) // CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) // CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]]() // CHECK: %[[S8:.*]] = affine.apply #[[$MAP1]]() -// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG6]], %[[S5]], 0, %[[ARG8]]] [1, %[[S7]], 1, 1] [1, 1, 1, 1] : 
tensor<3x8x1x5xf32> to tensor<1x?x1x1xf32> +// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG12]][%[[ARG6]], %[[S5]], 0, %[[ARG8]]] [1, %[[S7]], 1, 1] [1, 1, 1, 1] : tensor<3x8x1x5xf32> to tensor<1x?x1x1xf32> // CHECK: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x1x1x1x1x1xf32>) outs(%[[EXTRACTED_SLICE_9]] : tensor<1x?x1x1xf32>) -> tensor<1x?x1x1xf32> diff --git a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir index 4369f5f1eab4ca..16d06a74732729 100644 --- a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir +++ b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir @@ -100,21 +100,22 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg // CHECK-NEXT: %[[S8:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<2x12x12x2xf32>) { // CHECK-NEXT: %[[S9:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<2x12x12x2xf32>) { // CHECK-NEXT: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x2xf32> to tensor<6x6xf32> +// CHECK-NEXT: %[[S20:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK-NEXT: %[[S21:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK-NEXT: %[[S22:.*]] = tensor.extract_slice %[[ARG10]][%[[ARG7]], %[[S20]], %[[S21]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<4x4xf32> // CHECK-NEXT: %[[S11:.*]] = tensor.empty() : tensor<4x6xf32> // CHECK-NEXT: %[[S12:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S11]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK-NEXT: %[[S13:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_9]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S12]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK-NEXT: %[[S14:.*]] = 
tensor.empty() : tensor<4x4xf32> // CHECK-NEXT: %[[S15:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S14]] : tensor<4x4xf32>) -> tensor<4x4xf32> // CHECK-NEXT: %[[S16:.*]] = linalg.matmul ins(%[[S13]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S15]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK-NEXT: %[[S17:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK-NEXT: %[[S18:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S17]] : tensor<4x4xf32>) { -// CHECK-NEXT: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK-NEXT: linalg.yield %[[IN]] : f32 +// CHECK-NEXT: %[[S18:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]], %[[S16]] : f32, tensor<4x4xf32>) outs(%[[S22]] : tensor<4x4xf32>) { +// CHECK-NEXT: ^bb0(%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK-NEXT: %[[VAL_98:.*]] = arith.mulf %[[IN1]], %[[IN2]] : f32 +// CHECK-NEXT: %[[VAL_99:.*]] = arith.addf %[[VAL_98]], %[[OUT]] : f32 +// CHECK-NEXT: linalg.yield %[[VAL_99]] : f32 // CHECK-NEXT: } -> tensor<4x4xf32> -// CHECK-NEXT: %[[S19:.*]] = linalg.mul ins(%[[S18]], %[[S16]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S17]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK-NEXT: %[[S20:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) -// CHECK-NEXT: %[[S21:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S19]] into %[[ARG10]][%[[ARG7]], %[[S20]], %[[S21]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x12x12x2xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S18]] into %[[ARG10]][%[[ARG7]], %[[S20]], %[[S21]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x12x12x2xf32> // CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<2x12x12x2xf32> // CHECK-NEXT: } // CHECK-NEXT: scf.yield %[[S9]] : 
tensor<2x12x12x2xf32> From ebeb56af5f8f1ff9da8f5a7e98348f460d223de1 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 11 Oct 2024 10:40:28 +0200 Subject: [PATCH 139/177] [lldb] Only send "posix" error codes through the gdb-remote protocol (#108170) The other side has no way of telling which namespace do these codes belong to, so mashing them all together is not very helpful. I'm mainly doing this to simplify some code in a pending patch , and I've picked the posix error category semi-randomly. If we wanted to be serious about assigning meaning to these error codes, we should create a special error category for "gdb errors". From b222f319306a9cad9ac11183b7036ff45097c26f Mon Sep 17 00:00:00 2001 From: Dmitry Vasilyev Date: Fri, 11 Oct 2024 12:56:42 +0400 Subject: [PATCH 140/177] [lldb][test] Fixed the test `no_unique_address-with-bitfields` (#111902) Fixed the error `unable to create target: 'No available targets are compatible with triple "x86_64-apple-macosx10.4.0"'` running `clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s`. 
--- .../DWARF/{ => x86}/no_unique_address-with-bitfields.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename lldb/test/Shell/SymbolFile/DWARF/{ => x86}/no_unique_address-with-bitfields.cpp (100%) diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp similarity index 100% rename from lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp rename to lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp From 72f339de45bb590f25571c4c447a725e6f1dd8d7 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 11 Oct 2024 10:10:15 +0100 Subject: [PATCH 141/177] [LoopVectorize] Use predicated version of getSmallConstantMaxTripCount (#109928) There are a number of places where we call getSmallConstantMaxTripCount without passing a vector of predicates: getSmallBestKnownTC isIndvarOverflowCheckKnownFalse computeMaxVF isMoreProfitable I've changed all of these to now pass in a predicate vector so that we get the benefit of making better vectorisation choices when we know the max trip count for loops that require SCEV predicate checks. I've tried to add tests that cover all the cases affected by these changes. 
--- llvm/include/llvm/Analysis/ScalarEvolution.h | 7 + llvm/lib/Analysis/ScalarEvolution.cpp | 10 + .../Transforms/Vectorize/LoopVectorize.cpp | 48 ++- .../AArch64/low_trip_count_predicates.ll | 397 ++++++++++++++++++ .../RISCV/riscv-vector-reverse.ll | 2 + 5 files changed, 442 insertions(+), 22 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 179a2c38d9d3c2..328926f0b7aa65 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -2376,6 +2376,10 @@ class PredicatedScalarEvolution { /// Get the (predicated) symbolic max backedge count for the analyzed loop. const SCEV *getSymbolicMaxBackedgeTakenCount(); + /// Returns the upper bound of the loop trip count as a normal unsigned + /// value, or 0 if the trip count is unknown. + unsigned getSmallConstantMaxTripCount(); + /// Adds a new predicate. void addPredicate(const SCEVPredicate &Pred); @@ -2447,6 +2451,9 @@ class PredicatedScalarEvolution { /// The symbolic backedge taken count. const SCEV *SymbolicMaxBackedgeCount = nullptr; + + /// The constant max trip count for the loop. 
+ std::optional SmallConstantMaxTripCount; }; template <> struct DenseMapInfo { diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 3d890f05c8ca21..cea3a5bc865fee 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15050,6 +15050,16 @@ const SCEV *PredicatedScalarEvolution::getSymbolicMaxBackedgeTakenCount() { return SymbolicMaxBackedgeCount; } +unsigned PredicatedScalarEvolution::getSmallConstantMaxTripCount() { + if (!SmallConstantMaxTripCount) { + SmallVector Preds; + SmallConstantMaxTripCount = SE.getSmallConstantMaxTripCount(&L, &Preds); + for (const auto *P : Preds) + addPredicate(*P); + } + return *SmallConstantMaxTripCount; +} + void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) { if (Preds->implies(&Pred)) return; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f2bee2c67a2353..05dc58a42249ca 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -411,10 +411,10 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { /// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax. /// 4) Returns std::nullopt if all of the above failed. static std::optional -getSmallBestKnownTC(ScalarEvolution &SE, Loop *L, +getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax = true) { // Check if exact trip count is known. - if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) + if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L)) return ExpectedTC; // Check if there is an expected trip count available from profile data. @@ -426,7 +426,7 @@ getSmallBestKnownTC(ScalarEvolution &SE, Loop *L, return std::nullopt; // Check if upper bound estimate is known. 
- if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) + if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount()) return ExpectedTC; return std::nullopt; @@ -1789,12 +1789,15 @@ class GeneratedRTChecks { Loop *OuterLoop = nullptr; + PredicatedScalarEvolution &PSE; + public: - GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, - TargetTransformInfo *TTI, const DataLayout &DL, - bool AddBranchWeights) - : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), - MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {} + GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, + LoopInfo *LI, TargetTransformInfo *TTI, + const DataLayout &DL, bool AddBranchWeights) + : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"), + MemCheckExp(*PSE.getSE(), DL, "scev.check"), + AddBranchWeights(AddBranchWeights), PSE(PSE) {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are @@ -1941,7 +1944,7 @@ class GeneratedRTChecks { // Get the best known TC estimate. if (auto EstimatedTC = getSmallBestKnownTC( - *SE, OuterLoop, /* CanUseConstantMax = */ false)) + PSE, OuterLoop, /* CanUseConstantMax = */ false)) BestTripCount = *EstimatedTC; BestTripCount = std::max(BestTripCount, 1U); @@ -2272,8 +2275,7 @@ static bool isIndvarOverflowCheckKnownFalse( // We know the runtime overflow check is known false iff the (max) trip-count // is known and (max) trip-count + (VF * UF) does not overflow in the type of // the vector loop induction variable. 
- if (unsigned TC = - Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { + if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) { uint64_t MaxVF = VF.getKnownMinValue(); if (VF.isScalable()) { std::optional MaxVScale = @@ -3962,8 +3964,10 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); + unsigned MaxTC = PSE.getSmallConstantMaxTripCount(); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); + if (TC != MaxTC) + LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n'); if (TC == 1) { reportVectorizationFailure("Single iteration (non) loop", "loop trip count is one, irrelevant for vectorization", @@ -4257,7 +4261,7 @@ bool LoopVectorizationPlanner::isMoreProfitable( InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; - unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); + unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); // Improve estimate for the vector width if it is scalable. 
unsigned EstimatedWidthA = A.Width.getKnownMinValue(); @@ -4852,7 +4856,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (!Legal->isSafeForAnyVectorWidth()) return 1; - auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); + auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); const bool HasReductions = !Legal->getReductionVars().empty(); // If we did not calculate the cost for VF (because the user selected the VF) @@ -9618,8 +9622,8 @@ static bool processLoopInVPlanNativePath( { bool AddBranchWeights = hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getDataLayout(), AddBranchWeights); + GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), + AddBranchWeights); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" @@ -9683,7 +9687,7 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional VScale, Loop *L, - ScalarEvolution &SE, + PredicatedScalarEvolution &PSE, ScalarEpilogueLowering SEL) { InstructionCost CheckCost = Checks.getCost(); if (!CheckCost.isValid()) @@ -9768,7 +9772,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // Skip vectorization if the expected trip count is less than the minimum // required trip count. - if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { + if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) { if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), VF.MinProfitableTripCount)) { LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " @@ -9875,7 +9879,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. 
- auto ExpectedTC = getSmallBestKnownTC(*SE, L); + auto ExpectedTC = getSmallBestKnownTC(PSE, L); if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is worth vectorizing only if no scalar " @@ -9973,8 +9977,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool AddBranchWeights = hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getDataLayout(), AddBranchWeights); + GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), + AddBranchWeights); if (LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. IC = CM.selectInterleaveCount(VF.Width, VF.Cost); @@ -9990,7 +9994,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, - *PSE.getSE(), SEL)) { + PSE, SEL)) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll new file mode 100644 index 00000000000000..1ec384b05779a8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll @@ -0,0 +1,397 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; REQUIRES: asserts +; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -mattr=+sve 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG + +target triple = "aarch64-unknown-linux-gnu" + +; DEBUG-LABEL: LV: Checking a loop in 'low_vf_ic_is_better' +; DEBUG: LV: Found trip count: 0 +; DEBUG: LV: Found maximum trip count: 19 +; DEBUG: LV: IC is 1 +; DEBUG: LV: VF is vscale x 8 +; DEBUG: Main Loop VF:vscale x 8, Main Loop UF:1, Epilogue 
Loop VF:vscale x 4, Epilogue Loop UF:1 + +; DEBUG-LABEL: LV: Checking a loop in 'trip_count_too_small' +; DEBUG: LV: Found a loop with a very small trip count. This loop is worth vectorizing only if no scalar iteration overheads are incurred. +; DEBUG: LV: Not vectorizing: The trip count is below the minial threshold value.. + +; DEBUG-LABEL: LV: Checking a loop in 'too_many_runtime_checks' +; DEBUG: LV: Found trip count: 0 +; DEBUG: LV: Found maximum trip count: 16 +; DEBUG: LV: Clamping the MaxVF to maximum power of two not exceeding the constant trip count: 16 +; DEBUG: LV: IC is 1 +; DEBUG: LV: VF is 16 +; DEBUG: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (16 < 32) +; DEBUG: LV: Too many memory checks needed. + +; DEBUG-LABEL: LV: Checking a loop in 'overflow_indvar_known_false' +; DEBUG: LV: Found trip count: 0 +; DEBUG: LV: Found maximum trip count: 1027 +; DEBUG: LV: can fold tail by masking. +; DEBUG: Executing best plan with VF=vscale x 16, UF=1 + +define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef %val) { +; CHECK-LABEL: define void @low_vf_ic_is_better( +; CHECK-SAME: ptr nocapture noundef [[P:%.*]], i32 [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TC]], 19 +; CHECK-NEXT: br i1 [[CMP7]], label %[[ITER_CHECK:.*]], label %[[WHILE_END:.*]] +; CHECK: [[ITER_CHECK]]: +; CHECK-NEXT: [[CONV:%.*]] = trunc i16 [[VAL]] to i8 +; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[TC]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TC]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 20, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 
[[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TC]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 19, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt i64 [[TMP8]], 4294967295 +; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP11]], [[TMP12]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP3]], [[TMP15]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP17]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[CONV]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr 
inbounds i8, ptr [[TMP21]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP23]], ptr [[TMP22]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[IND_END5:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP33]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4 +; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[TMP3]], [[TMP35]] +; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF3]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC4]] +; CHECK-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement poison, i8 [[CONV]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector [[BROADCAST_SPLATINSERT8]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: 
[[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX7:%.*]] = add i64 [[TMP0]], [[INDEX6]] +; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[OFFSET_IDX7]], 0 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP39]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP40]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = add [[WIDE_LOAD7]], [[BROADCAST_SPLAT9]] +; CHECK-NEXT: store [[TMP41]], ptr [[TMP40]], align 1 +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX6]], [[TMP37]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[TMP42]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[CMP_N12]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP43]], [[CONV]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295 
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP44]], 19 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[WHILE_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[WHILE_END]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp ult i32 %tc, 19 + br i1 %cmp7, label %while.preheader, label %while.end + +while.preheader: + %conv = trunc i16 %val to i8 + %v = getelementptr inbounds nuw i8, ptr %p, i64 4 + %0 = zext nneg i32 %tc to i64 + br label %while.body + +while.body: + %iv = phi i64 [ %0, %while.preheader ], [ %iv.next, %while.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %arrayidx = getelementptr inbounds nuw i8, ptr %v, i64 %iv + %1 = load i8, ptr %arrayidx, align 1 + %add = add i8 %1, %conv + store i8 %add, ptr %arrayidx, align 1 + %2 = and i64 %iv.next, 4294967295 + %exitcond.not = icmp eq i64 %2, 19 + br i1 %exitcond.not, label %while.end, label %while.body + +while.end: + ret void +} + +define void @trip_count_too_small(ptr nocapture noundef %p, i32 noundef %tc, i16 noundef %val) { +; CHECK-LABEL: define void @trip_count_too_small( +; CHECK-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TC]], 3 +; CHECK-NEXT: br i1 [[CMP7]], label %[[WHILE_PREHEADER:.*]], label %[[WHILE_END:.*]] +; CHECK: [[WHILE_PREHEADER]]: +; CHECK-NEXT: [[CONV:%.*]] = trunc i16 [[VAL]] to i8 +; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[TC]] to i64 +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], 
i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP43]], [[CONV]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP44]], 3 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]] +; CHECK: [[WHILE_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[WHILE_END]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp ult i32 %tc, 3 + br i1 %cmp7, label %while.preheader, label %while.end + +while.preheader: + %conv = trunc i16 %val to i8 + %v = getelementptr inbounds nuw i8, ptr %p, i64 4 + %0 = zext nneg i32 %tc to i64 + br label %while.body + +while.body: + %iv = phi i64 [ %0, %while.preheader ], [ %iv.next, %while.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %arrayidx = getelementptr inbounds nuw i8, ptr %v, i64 %iv + %1 = load i8, ptr %arrayidx, align 1 + %add = add i8 %1, %conv + store i8 %add, ptr %arrayidx, align 1 + %2 = and i64 %iv.next, 4294967295 + %exitcond.not = icmp eq i64 %2, 3 + br i1 %exitcond.not, label %while.end, label %while.body + +while.end: + ret void +} + +define void @too_many_runtime_checks(ptr nocapture noundef %p, ptr nocapture noundef %p1, ptr nocapture noundef readonly %p2, ptr nocapture noundef readonly %p3, i32 noundef %tc, i16 noundef %val) { +; CHECK-LABEL: define void @too_many_runtime_checks( +; CHECK-SAME: ptr nocapture noundef [[P:%.*]], ptr nocapture noundef [[P1:%.*]], ptr nocapture noundef readonly [[P2:%.*]], ptr nocapture noundef readonly [[P3:%.*]], i32 noundef [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[TC]], 16 +; CHECK-NEXT: br i1 [[CMP20]], label %[[WHILE_PREHEADER:.*]], label %[[WHILE_END:.*]] +; CHECK: [[WHILE_PREHEADER]]: +; CHECK-NEXT: [[CONV8:%.*]] = trunc i16 [[VAL]] to i8 +; 
CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TC]] to i64 +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP1]], %[[WHILE_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P2]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP60:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P3]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP61:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP61]], [[TMP60]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P1]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP62:%.*]] = load i8, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP62]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP63:%.*]] = load i8, ptr [[ARRAYIDX10]], align 1 +; CHECK-NEXT: [[ADD12:%.*]] = add i8 [[TMP63]], [[CONV8]] +; CHECK-NEXT: store i8 [[ADD12]], ptr [[ARRAYIDX10]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP64:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP64]], 16 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]] +; CHECK: [[WHILE_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[WHILE_END]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp20 = icmp ult i32 %tc, 16 + br i1 %cmp20, label %while.preheader, label %while.end + +while.preheader: + %0 = trunc i16 %val to i8 + %v = getelementptr inbounds nuw i8, ptr %p, i64 4 + %1 = zext nneg i32 %tc to i64 + br label %while.body + +while.body: + %iv = phi i64 [ %1, 
%while.preheader ], [ %iv.next, %while.body ] + %arrayidx = getelementptr inbounds nuw i8, ptr %p2, i64 %iv + %2 = load i8, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds nuw i8, ptr %p3, i64 %iv + %3 = load i8, ptr %arrayidx2, align 1 + %mul = mul i8 %3, %2 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %p1, i64 %iv + %4 = load i8, ptr %arrayidx5, align 1 + %add = add i8 %mul, %4 + store i8 %add, ptr %arrayidx5, align 1 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %v, i64 %iv + %5 = load i8, ptr %arrayidx10, align 1 + %add12 = add i8 %5, %0 + store i8 %add12, ptr %arrayidx10, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %6 = and i64 %iv.next, 4294967295 + %exitcond.not = icmp eq i64 %6, 16 + br i1 %exitcond.not, label %while.end, label %while.body + +while.end: + ret void +} + +define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %tc, i16 noundef %val) vscale_range(1,16) { +; CHECK-LABEL: define void @overflow_indvar_known_false( +; CHECK-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TC]], 1027 +; CHECK-NEXT: br i1 [[CMP7]], label %[[WHILE_PREHEADER:.*]], label %[[WHILE_END:.*]] +; CHECK: [[WHILE_PREHEADER]]: +; CHECK-NEXT: [[CONV:%.*]] = trunc i16 [[VAL]] to i8 +; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[TC]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TC]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 1028, [[TMP20]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TC]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = sub i64 1027, [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 +; 
CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP21]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ugt i64 [[TMP23]], 4294967295 +; CHECK-NEXT: [[TMP28:%.*]] = or i1 [[TMP26]], [[TMP27]] +; CHECK-NEXT: br i1 [[TMP28]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP1]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[CONV]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] +; 
CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP15]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP1]]) +; CHECK-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 +; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[WHILE_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP18]], [[CONV]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP29:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP29]], 1027 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[WHILE_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[WHILE_END]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp ult i32 %tc, 1027 + br i1 %cmp7, label %while.preheader, label %while.end + +while.preheader: + %conv = trunc i16 %val to i8 + %v = getelementptr 
inbounds nuw i8, ptr %p, i64 4 + %0 = zext nneg i32 %tc to i64 + br label %while.body + +while.body: + %iv = phi i64 [ %0, %while.preheader ], [ %iv.next, %while.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %arrayidx = getelementptr inbounds nuw i8, ptr %v, i64 %iv + %1 = load i8, ptr %arrayidx, align 1 + %add = add i8 %1, %conv + store i8 %add, ptr %arrayidx, align 1 + %2 = and i64 %iv.next, 4294967295 + %exitcond.not = icmp eq i64 %2, 1027 + br i1 %exitcond.not, label %while.end, label %while.body, !llvm.loop !0 + +while.end: + ret void +} + + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 1d5e6c117a2eac..9a716f7756072e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -20,6 +20,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)! ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: LV: Found trip count: 0 +; CHECK-NEXT: LV: Found maximum trip count: 4294967295 ; CHECK-NEXT: LV: Scalable vectorization is available ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. 
@@ -224,6 +225,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)! ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: LV: Found trip count: 0 +; CHECK-NEXT: LV: Found maximum trip count: 4294967295 ; CHECK-NEXT: LV: Scalable vectorization is available ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. From 1276ce9e9713b2a0802004676fad7e40980396d5 Mon Sep 17 00:00:00 2001 From: Emilio Cota Date: Fri, 11 Oct 2024 05:08:23 -0400 Subject: [PATCH 142/177] Revert "[mlir][linalg] Introduce transpose semantic to 'linalg.matmul' ops. (#104783)" This reverts commit 03483737a7a2d72a257a5ab6ff01748ad9cf0f75 and 99c8557, which is a fix-up on top of the former. I'm reverting because this commit broke two tests: mlir/test/python/integration/dialects/linalg/opsrun.py mlir/test/python/integration/dialects/transform.py See https://lab.llvm.org/buildbot/#/builders/138/builds/4872 I'm not familiar with the tests, so I'm leaving it to the original author to either remove or adapt the broken tests, as discussed here: https://github.com/llvm/llvm-project/pull/104783#issuecomment-2406390905 --- .../Dialect/Linalg/IR/LinalgInterfaces.td | 10 - .../Linalg/IR/LinalgNamedStructuredOps.yaml | 72 +++++ .../Dialect/Linalg/IR/LinalgStructuredOps.td | 134 --------- .../Dialect/Linalg/IR/LinalgInterfaces.cpp | 17 +- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 263 +----------------- .../Linalg/Transforms/TransposeMatmul.cpp | 7 - .../Linalg/Transforms/Vectorization.cpp | 5 - .../NVGPU/TransformOps/NVGPUTransformOps.cpp | 6 - .../linalg/opdsl/ops/core_named_ops.py | 17 ++ .../Dialect/Linalg/generalize-named-ops.mlir | 111 -------- mlir/test/Dialect/Linalg/invalid.mlir | 159 ----------- mlir/test/Dialect/Linalg/named-ops.mlir | 243 ---------------- mlir/test/python/dialects/linalg/ops.py | 75 +++++ 
.../mlir-linalg-ods-yaml-gen.cpp | 6 +- 14 files changed, 182 insertions(+), 943 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index e80dbb2afb9ef7..fbf3f19cde0e9b 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -684,16 +684,6 @@ def LinalgStructuredInterface return; }] >, - InterfaceMethod< - /*desc=*/[{ - Return true if the user has supplied an explicit indexing maps for this op. - }], - /*retTy=*/"bool", - /*methodName=*/"hasUserDefinedMaps", - /*args=*/(ins), - /*methodBody=*/"", - /*defaultImplementation=*/[{ return false; }] - >, //===------------------------------------------------------------------===// // Linalg generalization hooks. //===------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml index 97b90333e2b200..8cb698096ef5b7 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -1065,6 +1065,78 @@ structured_op: !LinalgStructuredOpConfig - !ScalarExpression scalar_arg: rhs --- !LinalgOpConfig +metadata: !LinalgOpMetadata + name: matmul + cpp_class_name: MatmulOp + doc: |- + Performs a matrix multiplication of two 2D inputs. + + Numeric casting is performed on the operands to the inner multiply, promoting + them to the same data type as the accumulator/output. 
+ implements: + - LinalgContractionOpInterface +structured_op: !LinalgStructuredOpConfig + args: + - !LinalgOperandDefConfig + name: A + kind: input_tensor + type_var: T1 + shape_map: affine_map<()[s0, s1, s2] -> (s0, s1)> + - !LinalgOperandDefConfig + name: B + kind: input_tensor + type_var: T2 + shape_map: affine_map<()[s0, s1, s2] -> (s1, s2)> + - !LinalgOperandDefConfig + name: C + kind: output_tensor + type_var: U + shape_map: affine_map<()[s0, s1, s2] -> (s0, s2)> + - !LinalgOperandDefConfig + name: cast + kind: type_fn_attr + default_fn: cast_signed + indexing_maps: !LinalgIndexingMapsConfig + static_indexing_maps: + - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d2)> + - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2, d1)> + - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d1)> + iterator_types: + - parallel + - parallel + - reduction + assignments: + - !ScalarAssign + arg: C + value: !ScalarExpression + scalar_fn: + kind: binary + fn_name: add + operands: + - !ScalarExpression + scalar_arg: C + - !ScalarExpression + scalar_fn: + kind: binary + fn_name: mul + operands: + - !ScalarExpression + scalar_fn: + kind: type + attr_name: cast + type_var: U + operands: + - !ScalarExpression + scalar_arg: A + - !ScalarExpression + scalar_fn: + kind: type + attr_name: cast + type_var: U + operands: + - !ScalarExpression + scalar_arg: B +--- !LinalgOpConfig metadata: !LinalgOpMetadata name: quantized_matmul cpp_class_name: QuantizedMatmulOp diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 61d4fc9734c6de..31f29139247267 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -535,140 +535,6 @@ def BroadcastOp : LinalgStructuredBase_Op<"broadcast", [ let hasCanonicalizer = 1; } -//===----------------------------------------------------------------------===// -// Op definition for MatmulOp 
-//===----------------------------------------------------------------------===// - -def MatmulOp : LinalgStructuredBase_Op<"matmul", [ - AttrSizedOperandSegments, - LinalgContractionOpInterface]> { - - let summary = [{ - Performs a matrix multiplication of two 2D inputs without broadcast or transpose. - }]; - let description = [{ - Numeric casting is performed on the operands to the inner multiply, - promoting them to the same data type as the accumulator/output. - - Broadcast and Transpose semantics can be appiled by specifying the explicit attribute - 'indexing_maps' as shown below.This is a list attribute, so the list must include all - the maps if specified. - - Example Transpose: - ``` - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, // transpose - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>,memref<5x7xf32>) - outs(%arg2: memref<3x7xf32>) - ``` - - Example Broadcast: - ``` - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, // broadcast - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3xf32>, memref<5x7xf32>) - outs(%arg2: memref<3x7xf32>) - ``` - - Example Broadcast and transpose: - ``` - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, // transpose - affine_map<(d0, d1, d2) -> (d2)>, // broadcast - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) - }]; - - let arguments = (ins - Variadic:$inputs, - Variadic:$outputs, - DefaultValuedOptionalAttr:$indexing_maps, - DefaultValuedOptionalAttr:$cast - ); - let results = (outs Variadic:$result_tensors); - let regions = (region AnyRegion:$region); - - let skipDefaultBuilders = 1; - let builders = [ - OpBuilder< - (ins "ValueRange":$inputs, "ValueRange":$outputs, - CArg<"ArrayRef", "{}">:$attributes), - [{ - buildStructuredOp($_builder, $_state, 
std::nullopt, inputs, outputs, - attributes, MatmulOp::getRegionBuilder()); - }]>, - OpBuilder< - (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs, - "ValueRange":$outputs, - CArg<"ArrayRef", "{}">:$attributes), - [{ - buildStructuredOp($_builder, $_state, resultTensorTypes, - inputs, outputs, attributes, MatmulOp::getRegionBuilder()); - }]>, - OpBuilder< - (ins "TypeRange":$resultTensorTypes, "ValueRange":$operands, - CArg<"ArrayRef", "{}">:$attributes), - [{ - $_state.addOperands(operands); - $_state.addAttributes(attributes); - $_state.addTypes(resultTensorTypes); - (void)$_state.addRegion(); - }]>, - OpBuilder< - (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs, - "ValueRange":$outputs, - "Attribute":$cast, CArg<"ArrayRef", "{}">:$attributes), - [{ - $_state.addAttribute("cast", cast); - buildStructuredOp($_builder, $_state, resultTensorTypes, inputs, outputs, - attributes, MatmulOp::getRegionBuilder()); - }]> - - ]; - let hasCustomAssemblyFormat = 1; - let hasFolder = 1; - let hasVerifier = 1; - - let extraClassDeclaration = structuredOpsBaseDecls # [{ - SmallVector getIteratorTypesArray(); - - /// Implements the block region builder. - static void regionBuilder(ImplicitLocOpBuilder &b, - Block &block, ArrayRef attrs); - - /// Returns a list of AffineMap with the typical matmul indexing charactristic. - SmallVector getDefaultIndexingMaps(); - - /// Returns true if the given broadcast map \p bcastMap is valid for this op. - bool isValidLhsRhsBroadcastMap(AffineMap bcastMap); - - static std::function)> - getRegionBuilder() { - return regionBuilder; - } - - ::mlir::MutableOperandRange getDpsInitsMutable() { - return getOutputsMutable(); - } - - // Generic methods. - static unsigned getNumRegionArgs(); - std::string getLibraryCallName(); - bool hasDynamicIndexingMaps(); - /// Check if the op has broadcast and/or transpose semantic. Returns true if the - /// user defined indexing maps are not equal to default map. 
- bool hasUserDefinedMaps(); - }]; -} - //===----------------------------------------------------------------------===// // Named Linalg ops, implemented as a declarative configurations of generic ops. //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp index 3b9194098fa783..40795879c3026d 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp @@ -15,20 +15,13 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/AffineMap.h" -#include "mlir/IR/BuiltinTypeInterfaces.h" -#include "mlir/IR/MLIRContext.h" #include "mlir/IR/TypeUtilities.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/raw_ostream.h" #include -#include using namespace mlir; using namespace mlir::linalg; @@ -1149,6 +1142,7 @@ int64_t LinalgOp::getIndexingMapIndex(OpOperand *opOperand) { LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { LinalgOp linalgOp = cast(op); + // Mixed tensor/buffer operands are not allowed. if (!linalgOp.hasPureTensorSemantics() && !linalgOp.hasPureBufferSemantics() && op->getNumOperands() > 0) @@ -1168,8 +1162,6 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { << ") to be equal to the number of input/output operands (" << linalgOp->getNumOperands() << ")"; - // Set this flag if this op has user defined maps. This is required to guard - // the below error condition which assume default indexing maps. 
for (OpOperand &opOperand : linalgOp->getOpOperands()) { AffineMap indexingMap = linalgOp.getMatchingIndexingMap(&opOperand); @@ -1186,13 +1178,13 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { << " dim(s) to match the number of loops"; int64_t rank = linalgOp.getRank(&opOperand); - if (indexingMap.getNumResults() != rank) return op->emitOpError("expected operand rank (") << rank << ") to match the result rank of indexing_map #" << opOperand.getOperandNumber() << " (" << indexingMap.getNumResults() << ")"; } + SmallVector redDims; linalgOp.getReductionDims(redDims); @@ -1202,8 +1194,9 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { // Check if given shapes match to inferred shapes. SmallVector endLoopRangeValues = linalgOp.getStaticLoopRanges(); SmallVector startLoopRangeValues(endLoopRangeValues.size(), 0); - // Verify only static cases since we can't get exact dimension sizes and - // loop ranges for dynamic cases in this stage. + + // Verify only static cases since we can't get exact dimension sizes and loop + // ranges for dynamic cases in this stage. 
if (llvm::none_of(endLoopRangeValues, ShapedType::isDynamic)) { for (int64_t &range : endLoopRangeValues) range -= 1; diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index c909d13e4314b4..730c478c2883ef 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -27,7 +27,6 @@ #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/AffineMap.h" -#include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/Matchers.h" @@ -38,17 +37,12 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/LogicalResult.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include #include using namespace mlir; @@ -155,36 +149,15 @@ static void fillStructuredOpRegion(OpBuilder &opBuilder, Region ®ion, // iterator_types is an auto-generated method. } -/// Helper to create a typical indexing map for MatmulOp. Returns a list of -/// AffineMap. -static SmallVector -getDefaultIndexingMapsForMatmul(MLIRContext *context) { - AffineExpr d0, d1, d2; - SmallVector indexingMaps; - bindDims(context, d0, d1, d2); - indexingMaps.push_back(AffineMap::get(3, 0, {d0, d2}, context)); - indexingMaps.push_back(AffineMap::get(3, 0, {d2, d1}, context)); - indexingMaps.push_back(AffineMap::get(3, 0, {d0, d1}, context)); - return indexingMaps; -} - -/// Wrapper to return the typical indexing map array attribute for MatmulOp. 
-static SmallVector getDefaultIndexingMapAttr(MLIRContext *context) { - return llvm::map_to_vector( - getDefaultIndexingMapsForMatmul(context), - [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); }); -} - /// Creates a structured operation given `inputs`, `outputs`, and `attributes`. /// The result types are derived automatically if `resultTensorTypes` is none. /// The body of the operation is filled using `regionBuilder`. All ods-gen /// created structured operations use the method to implement their builders. -static void buildStructuredOp( - OpBuilder &b, OperationState &state, - std::optional resultTensorTypes, ValueRange inputs, - ValueRange outputs, ArrayRef attributes, - RegionBuilderFn regionBuilder, - std::optional> indexingMaps = std::nullopt) { +static void buildStructuredOp(OpBuilder &b, OperationState &state, + std::optional resultTensorTypes, + ValueRange inputs, ValueRange outputs, + ArrayRef attributes, + RegionBuilderFn regionBuilder) { // Derive the result types if needed. SmallVector derivedResultTypes = resultTensorTypes.value_or(TypeRange()); @@ -195,20 +168,6 @@ static void buildStructuredOp( state.addOperands(inputs); state.addOperands(outputs); state.addTypes(derivedResultTypes); - - // Initialize indexingMaps, for MatmulOp. 
- SmallVector indexingMapsAttrVal; - if (indexingMaps.has_value()) { - for (mlir::AffineMap map : *indexingMaps) { - // Convert each AffineMap to an AffineMapAttr - indexingMapsAttrVal.push_back(AffineMapAttr::get(map)); - } - state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal)); - } else { - indexingMapsAttrVal = getDefaultIndexingMapAttr(b.getContext()); - state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal)); - } - state.addAttributes(attributes); state.addAttribute( "operandSegmentSizes", @@ -340,48 +299,11 @@ static ParseResult parseNamedStructuredOp(OpAsmParser &parser, OperationState &result, unsigned numRegionArgs, RegionBuilderFn regionBuilder) { - - SmallVector indexingMapsAttr; - Attribute mapAttr; - if (succeeded(parser.parseOptionalKeyword("indexing_maps"))) { - if (parser.parseEqual()) - return failure(); - - if (parser.parseLSquare()) - return failure(); - - do { - if (parser.parseAttribute(mapAttr)) - return failure(); - if (!isa(mapAttr)) { - return parser.emitError(parser.getCurrentLocation(), - "expected affine map attribute"); - } - indexingMapsAttr.push_back(mapAttr); - - if (parser.parseOptionalComma()) - break; - } while (true); - - if (parser.parseRSquare()) - return failure(); - } - // Initialize indexingMaps, if not supplied explicitly. - if (indexingMapsAttr.empty()) { - indexingMapsAttr = getDefaultIndexingMapAttr(result.getContext()); - } - result.addAttribute("indexing_maps", - parser.getBuilder().getArrayAttr(indexingMapsAttr)); - // TODO: Enable when ods-gen supports captures. SmallVector inputTypes, outputTypes; if (parseCommonStructuredOpParts(parser, result, inputTypes, outputTypes)) return failure(); - // Parse optional attributes. - if (parser.parseOptionalAttrDict(result.attributes)) - return failure(); - // TODO: consider merging results parsing into region parsing. // Need to wait for declarative assembly resolution to decide. 
SmallVector outputTensorsTypes; @@ -407,9 +329,13 @@ static void printNamedStructuredOpResults(OpAsmPrinter &p, } static void printNamedStructuredOp(OpAsmPrinter &p, Operation *op, - ValueRange inputs, ValueRange outputs, - ArrayRef elidedAttrs = {}) { - p.printOptionalAttrDict(op->getAttrs(), elidedAttrs); + ValueRange inputs, ValueRange outputs) { + p.printOptionalAttrDict( + op->getAttrs(), + /*elidedAttrs=*/{"operandSegmentSizes", + // See generated code in + // LinalgNamedStructuredOps.yamlgen.cpp.inc + "linalg.memoized_indexing_maps"}); // Printing is shared with generic ops, except for the region and // attributes. @@ -3456,168 +3382,3 @@ Operation *LinalgDialect::materializeConstant(OpBuilder &builder, Location loc) { return arith::ConstantOp::materialize(builder, value, type, loc); } - -/// Returns true if the result AffineExpr of the \p explicitMap is same as \p -/// defaultMap. -static bool isValidResultDimExprs(AffineMap explictMap, AffineMap defaultMap) { - auto explicitRange = explictMap.getResults(); - auto defaultRange = defaultMap.getResults(); - DenseSet explicitSet(explicitRange.begin(), explicitRange.end()); - DenseSet defaultSet(defaultRange.begin(), defaultRange.end()); - llvm::set_union(explicitSet, defaultSet); - return explicitSet == defaultSet; -} - -/// Returns true if the \p explictMap is broadcasted with respect to the -/// \p defaultMap. -static bool isBroadcasted(AffineMap explictMap, AffineMap defaultMap) { - return explictMap.getNumResults() < defaultMap.getNumResults(); -} - -/// Verifies the broadcast and transpose semantic sepecified by the explicit -/// indexing map for the MatmulOp \p op for each operand specified by \p -/// opIndex. 
-static LogicalResult verifyExtendedMatmulSemantic(MatmulOp matmulOp, - unsigned opIndex) { - SmallVector opIndexingMaps = matmulOp.getIndexingMapsArray(); - SmallVector defaultIndexingMaps = - matmulOp.getDefaultIndexingMaps(); - - auto opIndexingMap = opIndexingMaps[opIndex]; - auto defaultIndexingMap = defaultIndexingMaps[opIndex]; - // Check general validity of indexing map results. - if (!isValidResultDimExprs(opIndexingMap, defaultIndexingMap)) - return matmulOp->emitOpError() - << "Unexpected dim expression in map result."; - - // Check if the requested broadcast is valid. - if (isBroadcasted(opIndexingMap, defaultIndexingMap)) { - if (!matmulOp.isValidLhsRhsBroadcastMap(opIndexingMap)) { - return matmulOp->emitOpError() - << "Invalid broadcast requested, should be (d2)."; - } - return success(); - } - return success(); -} - -namespace mlir { -namespace linalg { -//===----------------------------------------------------------------------===// -// MatMulOp -//===----------------------------------------------------------------------===// -SmallVector MatmulOp::getIteratorTypesArray() { - return SmallVector{utils::IteratorType::parallel, - utils::IteratorType::parallel, - utils::IteratorType::reduction}; -} - -unsigned MatmulOp::getNumRegionArgs() { return 3; } - -std::string MatmulOp::getLibraryCallName() { - return generateLibraryCallName(getOperation()); -} - -bool MatmulOp::hasDynamicIndexingMaps() { return true; } - -/// Check if the op has broadcast and/or transpose semantic. Returns true if the -/// user defined indexing maps are not equal to default map. -bool MatmulOp::hasUserDefinedMaps() { - SmallVector defaultMaps = getDefaultIndexingMaps(); - SmallVector explicitMaps = getIndexingMapsArray(); - return defaultMaps != explicitMaps; -} - -/// Implements the block region builder for the MatmulOp. This is called by -/// 'fillStructuredOpRegion'. 
-void MatmulOp::regionBuilder(ImplicitLocOpBuilder &b, Block &block, - ArrayRef attrs) { - assert(3 > 0 && block.getNumArguments() == 3 && - "MatmulOp regionBuilder expects 3 (>=0) args"); - RegionBuilderHelper helper(b, block); - SmallVector yields; - - TypeFn castVal = TypeFn::cast_signed; - auto castIter = llvm::find_if(attrs, [&](const NamedAttribute &attr) { - return attr.getName() == "cast"; - }); - if (castIter != attrs.end()) { - if (auto attr = llvm::dyn_cast(castIter->getValue())) - castVal = attr.getValue(); - } - - Value value1 = helper.buildTypeFn(castVal, block.getArgument(2).getType(), - block.getArgument(0)); - Value value2 = helper.buildTypeFn(castVal, block.getArgument(2).getType(), - block.getArgument(1)); - Value value3 = helper.buildBinaryFn(BinaryFn::mul, value1, value2); - Value value4 = - helper.buildBinaryFn(BinaryFn::add, block.getArgument(2), value3); - yields.push_back(value4); - helper.yieldOutputs(yields); -} - -/// Returns a list of AffineMap with the typical matmul indexing charactristic. -SmallVector MatmulOp::getDefaultIndexingMaps() { - MLIRContext *context = this->getContext(); - return getDefaultIndexingMapsForMatmul(context); -} - -/// Returns true if the given broadcast map \p bcastMap is valid for this op. -bool MatmulOp::isValidLhsRhsBroadcastMap(AffineMap bcastMap) { - assert(bcastMap.getNumResults() == 1 && "Expected single result dim expr."); - AffineExpr exp = bcastMap.getResult(0); - // Invalid map if the common dimension of matmul not found. 
- return exp.isFunctionOfDim(bcastMap.getNumDims() - 1); -} - -ParseResult MatmulOp::parse(OpAsmParser &parser, OperationState &result) { - return parseNamedStructuredOp(parser, result, MatmulOp::getNumRegionArgs(), - MatmulOp::getRegionBuilder()); -} -void MatmulOp::print(OpAsmPrinter &p) { - SmallVector elidedAttrs = { - "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; - printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), - elidedAttrs); - - SmallVector indexingMaps = - getDefaultIndexingMapAttr(getContext()); - if (!llvm::equal(getIndexingMaps(), indexingMaps)) { - p << " indexing_maps = ["; - llvm::interleaveComma(getIndexingMaps(), p, - [&](Attribute attr) { p.printAttribute(attr); }); - p << "]"; - } -} - -/// Verify the user defined indexing maps. -LogicalResult MatmulOp::verify() { - // Verification of pure matmul is handled by verifyStructuredOpInterface(). - if (!hasUserDefinedMaps()) - return success(); - - for (unsigned opIndex = 0; opIndex < 2; opIndex++) { - if (failed(verifyExtendedMatmulSemantic(*this, opIndex))) - return failure(); - } - return success(); -} - -LogicalResult MatmulOp::fold(FoldAdaptor, SmallVectorImpl &) { - return memref::foldMemRefCast(*this); -} -void MatmulOp::getEffects( - SmallVectorImpl> - &effects) { - if (hasPureTensorSemantics()) - return; - getGenericEffectsImpl(effects, cast(getOperation())); -} - -Speculation::Speculatability MatmulOp::getSpeculatability() { - return getGenericSpeculatabilityImpl(cast(getOperation())); -} - -} // namespace linalg -} // namespace mlir diff --git a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp index 6b934f7e8157d4..aa0052ce47fa7b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp @@ -31,13 +31,6 @@ using namespace mlir::linalg; FailureOr mlir::linalg::transposeMatmul(RewriterBase &rewriter, 
linalg::MatmulOp matmulOp, bool transposeLHS) { - // Check to not let go the matmul with extended semantic, through this - // transform. - if (matmulOp.hasUserDefinedMaps()) { - return rewriter.notifyMatchFailure( - matmulOp, "only matmul ops with non-extended semantics are supported"); - } - if (!bufferization::hasTensorSemantics(matmulOp)) return rewriter.notifyMatchFailure( matmulOp, "only matmul ops with tensors are supported"); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index e3f010d9cfb20b..09c6b2683b4388 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -2071,11 +2071,6 @@ vectorizeScalableVectorPrecondition(Operation *op, return failure(); } - // Check to not let go the matmul with extended semantic, through this - // transform. - if (linalgOp.hasUserDefinedMaps()) - return failure(); - // Cond 4: Only the following ops are supported in the // presence of scalable vectors return success(isElementwise(linalgOp) || isa(op) || diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp index 3c508ed6e324b2..0c2275bbc4b224 100644 --- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp +++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp @@ -821,12 +821,6 @@ DiagnosedSilenceableFailure transform::RewriteMatmulAsMmaSyncOp::applyToOne( bool fail = true; // TODO: more robust detection of matmulOp, with transposes etc. if (isa_and_nonnull(linalgOp.getOperation())) { - // Check to not let go the matmul with extended semantic, through this - // transform. - if (linalgOp.hasUserDefinedMaps()) { - return emitSilenceableError() - << "only matmul ops with non-extended semantics are supported"; - } Location loc = linalgOp.getLoc(); // TODO: more robust computation of laneId, for now assume a single warp. 
Value laneId = rewriter.create( diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py index d5e79b4d3cb6dd..e4a6ec7487bb2f 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -383,6 +383,23 @@ def select( O[None] = TernaryFn.select(cond[None], lhs[None], rhs[None]) +@linalg_structured_op +def matmul( + A=TensorDef(T1, S.M, S.K), + B=TensorDef(T2, S.K, S.N), + C=TensorDef(U, S.M, S.N, output=True), + cast=TypeFnAttrDef(default=TypeFn.cast_signed), +): + """Performs a matrix multiplication of two 2D inputs. + + Numeric casting is performed on the operands to the inner multiply, promoting + them to the same data type as the accumulator/output. + """ + domain(D.m, D.n, D.k) + implements(ContractionOpInterface) + C[D.m, D.n] += cast(U, A[D.m, D.k]) * cast(U, B[D.k, D.n]) + + @linalg_structured_op def quantized_matmul( A=TensorDef(T1, S.M, S.K), diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir index aba26c35931fd3..1e8f1435ca0fa5 100644 --- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir @@ -29,34 +29,6 @@ func.func @generalize_matmul_buffer(%A : memref<16x8xf32>, %B: memref<8x32xf32>, // ----- -func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func.func @matmul_bcast_a( 
-// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5x7xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) { -// CHECK: ^bb0(%[[VAL_3:.*]]: f32, %[[VAL_4:.*]]: f32, %[[VAL_5:.*]]: f32): -// CHECK: %[[VAL_6:.*]] = arith.mulf %[[VAL_3]], %[[VAL_4]] : f32 -// CHECK: %[[VAL_7:.*]] = arith.addf %[[VAL_5]], %[[VAL_6]] : f32 -// CHECK: linalg.yield %[[VAL_7]] : f32 -// CHECK: } -// CHECK: return -// CHECK: } - -// ----- - func.func @generalize_matmul_tensor(%A : tensor<16x8xf32>, %B: tensor<8x32xf32>, %C: tensor<16x32xf32>) -> tensor<16x32xf32> { %0 = linalg.matmul ins(%A, %B: tensor<16x8xf32>, tensor<8x32xf32>) outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32> @@ -919,86 +891,3 @@ func.func @fill_tensor(%f: f32, %v: vector<2x4xf32>) -> (tensor, tensor, tensor> } - -// ----- - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_transpose_a_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { - -// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} -// CHECK: arith.mulf -// CHECK: arith.addf - -func.func @matmul_transpose_a_explicit(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) - outs(%arg2: 
memref<3x7xf32>) - - return -} - -// ----- - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func.func @matmul_transpose_b_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: memref<3x5xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { - -// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} -// CHECK: arith.mulf -// CHECK: arith.addf - -func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>) - outs(%arg2: memref<3x7xf32>) - - return -} - -// ----- - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_transpose_a_b_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { - -// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} -// CHECK: arith.mulf -// CHECK: arith.addf - -func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<7x5xf32>) - outs(%arg2: memref<3x7xf32>) - - return -} - -// 
----- - diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index b2869893b8042d..c481a723c5623c 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -361,165 +361,6 @@ func.func @invalid_static_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>, // ----- -func.func @invalid_indexing_maps_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>, %arg2: memref<2x4xf32>) { - // expected-error @+1 {{expected attribute value}} - linalg.matmul indexing_maps = [ - , - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<2x4xf32>, memref<3x4xf32>) - outs(%arg2 :memref<2x4xf32>) - return -} - -// ----- - -func.func @invalid_matmul_dim_a(%arg0: memref<5x5xf32>, %arg1: memref<5x5xf32>, %arg2: memref<5x5xf32>) { - // expected-error @+1 {{Unexpected dim expression in map result}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x5xf32>, memref<5x5xf32>) outs(%arg2: memref<5x5xf32>) - return -} - -// ----- - -func.func @invalid_matmul_dim_b(%arg0: memref<5x5xf32>, %arg1: memref<5x5xf32>, %arg2: memref<5x5xf32>) { - // expected-error @+1 {{Unexpected dim expression in map result}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x5xf32>, memref<5x5xf32>) outs(%arg2: memref<5x5xf32>) - return -} - -// ----- - -func.func @invalid_transpose_a_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) -> tensor<4x64xf32> { - // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 4, but found 1}} - %0 = linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - 
affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) - outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32> - return %0: tensor<4x64xf32> -} - -// ----- - -func.func @invalid_transpose_b_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) -> tensor<4x64xf32> { - // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #1 to be 1, but found 64}} - %0 = linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) - outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32> - return %0: tensor<4x64xf32> -} - -// ----- - -func.func @invalid_bcast_a(%arg0: memref<3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op Invalid broadcast requested, should be (d2)}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op Invalid broadcast requested, should be (d2)}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_bcast_a_rank_mismatch(%arg0: memref<3x5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op expected operand rank (2) to match the result rank of indexing_map #0 (1)}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, 
d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_bcast_b_rank_mismatch(%arg0: memref<3x5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op expected operand rank (2) to match the result rank of indexing_map #1 (1)}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 5, but found 7}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_matmul_bcast_b_transpose_a_wrong_dim(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op Unexpected dim expression in map result.}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_indexing_maps_placement_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) { - // expected-error @+2 {{custom op 'indexing_maps' is unknown (tried 'func.indexing_maps' as well)}} - linalg.matmul ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) outs(%init : tensor<4x64xf32>) - indexing_maps = [ - affine_map<(d0, d1, d2) -> 
(d0, d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - return -} - -// ----- - func.func @invalid_static_2d_conv(%input : memref<1x3x4x2xf32>, %filter: memref<3x2x2x1xf32>, %output: memref<1x2x3x1xf32>) { // expected-error @+1 {{inferred input/output operand #0 has shape's dimension #1 to be greater than or equal to 4, but found 3}} linalg.conv_2d_nhwc_hwcf diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index 65c18de8424771..02ecbed232c8b5 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -1201,249 +1201,6 @@ func.func @matmul_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %a // ----- -// CHECK-LABEL: func @matmul_transpose_a_explicit -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5x3xf32>, memref<5x7xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) -func.func @matmul_transpose_a_explicit(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) - outs(%arg2: memref<3x7xf32>) - - return -} - -// ----- - -func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>) - outs(%arg2: memref<3x7xf32>) - - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_transpose_b_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: 
memref<3x5xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<7x5xf32>) - outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_transpose_a_b_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func @matmul_bcast_a -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : 
memref<5xf32>, memref<5x7xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) - -// ----- - -func.func @matmul_bcast_a_dim1(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func @matmul_bcast_a_dim1 -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5xf32>, memref<5x7xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) - -// ----- - -func.func @matmul_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func @matmul_bcast_b -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) - -// ----- - -func.func @matmul_bcast_a_b(%arg0: memref<5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: 
#[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_bcast_a_b( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, %[[VAL_1:.*]]: memref<5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_0]], #[[$ATTR_1]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_bcast_b_dim1(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func @matmul_bcast_b_dim1 -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) - -// ----- - -func.func @dynamic_matmul_bcast_a(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref, memref) outs(%arg2: memref) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @dynamic_matmul_bcast_a( -// CHECK-SAME: %[[VAL_0:.*]]: memref, -// CHECK-SAME: %[[VAL_1:.*]]: memref, -// CHECK-SAME: %[[VAL_2:.*]]: memref) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref, memref) outs(%[[VAL_2]] : memref) indexing_maps = [#[[$ATTR_0]], 
#[[$ATTR_1]], #[[$ATTR_2]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_bcast_a_transpose_b(%arg0: memref<5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<7x5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_bcast_a_transpose_b( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_bcast_b_transpose_a( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] -// 
CHECK: return -// CHECK: } - -// ----- - // CHECK-LABEL: func @matmul_transpose_b // CHECK: linalg.matmul_transpose_b // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<7x5xf32>) diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py index 72045a07b2da80..3bfbcf7d7f7c81 100644 --- a/mlir/test/python/dialects/linalg/ops.py +++ b/mlir/test/python/dialects/linalg/ops.py @@ -84,6 +84,81 @@ def named_form(lhs, rhs): print(module) + +# CHECK-LABEL: TEST: testNamedStructuredOpGenericForm +@run +def testNamedStructuredOpGenericForm(): + with Context() as ctx, Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + + @func.FuncOp.from_py_func( + RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) + ) + def named_form(lhs, rhs): + init_result = tensor.empty([4, 8], f32) + # CHECK: "linalg.matmul"(%{{.*}}) + # CHECK-SAME: cast = #linalg.type_fn + # CHECK-SAME: operandSegmentSizes = array + # CHECK-NEXT: ^bb0(%{{.*}}: f32, %{{.*}}: f32, %{{.*}}: f32): + # CHECK-NEXT: arith.mulf{{.*}} (f32, f32) -> f32 + # CHECK-NEXT: arith.addf{{.*}} (f32, f32) -> f32 + # CHECK-NEXT: linalg.yield{{.*}} (f32) -> () + # CHECK-NEXT: (tensor<4x16xf32>, tensor<16x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> + return linalg.matmul(lhs, rhs, outs=[init_result]) + + module.operation.print(print_generic_op_form=True) + + +# CHECK-LABEL: TEST: testNamedStructuredAsGenericOp +@run +def testNamedStructuredAsGenericOp(): + with Context() as ctx, Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + + @func.FuncOp.from_py_func( + RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) + ) + def generic_form(lhs, rhs): + init_result = tensor.EmptyOp([4, 8], f32) + # CHECK: linalg.generic + return linalg.matmul( + lhs, rhs, outs=[init_result.result], emit_generic=True + ) + + print(module) + + +# CHECK-LABEL: TEST: 
testOpResultFromOtherOp +@run +def testOpResultFromOtherOp(): + with Context(), Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + + @func.FuncOp.from_py_func( + RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) + ) + def pass_an_op_directly(arg0, arg1): + one = arith.ConstantOp(F32Type.get(), 1.0) + # CHECK: %[[LHS:.*]] = linalg.fill + lhs = linalg.fill(one, outs=[arg0]) + # CHECK: %[[RHS:.*]] = linalg.fill + rhs = linalg.fill(one, outs=[arg1]) + # CHECK: %[[INIT:.*]] = tensor.empty + init = tensor.EmptyOp([4, 8], f32) + # CHECK: linalg.matmul + # CHECK: ins(%[[LHS]], %[[RHS]] + # CHECK: outs(%[[INIT]] + return linalg.matmul(lhs, rhs, outs=init) + + print(module) + + # CHECK-LABEL: TEST: testIdentityRegionOps @run def testIdentityRegionOps(): diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp index f820cb7ee8c3c4..aa5a52a21f1251 100644 --- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp +++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp @@ -681,11 +681,7 @@ ParseResult {0}::parse(OpAsmParser &parser, OperationState &result) {{ {0}::getNumRegionArgs(), {0}::getRegionBuilder()); } void {0}::print(OpAsmPrinter &p) {{ - SmallVector elidedAttrs = {{"operandSegmentSizes", - "linalg.memoized_indexing_maps", - "indexing_maps"}; - ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), - elidedAttrs); + ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs()); } )FMT"; From a4402039bffd788b9af82435fd5a2fb311fdc6e8 Mon Sep 17 00:00:00 2001 From: Sebastian Kreutzer Date: Fri, 11 Oct 2024 05:23:34 -0400 Subject: [PATCH 143/177] [XRay] Add support for instrumentation of DSOs on x86_64 (#90959) This PR introduces shared library (DSO) support for XRay based on a revised version of the implementation outlined in [this 
RFC](https://discourse.llvm.org/t/rfc-upstreaming-dso-instrumentation-support-for-xray/73000). The feature enables the patching and handling of events from DSOs, supporting both libraries linked at startup or explicitly loaded, e.g. via `dlopen`. This patch adds the following: - The `-fxray-shared` flag to enable the feature (turned off by default) - A small runtime library that is linked into every instrumented DSO, providing position-independent trampolines and code to register with the main XRay runtime - Changes to the XRay runtime to support management and patching of multiple objects These changes are fully backward compatible, i.e. running without instrumented DSOs will produce identical traces (in terms of recorded function IDs) to the previous implementation. Due to my limited ability to test on other architectures, this feature is only implemented and tested with x86_64. Extending support to other architectures is fairly straightforward, requiring only a position-independent implementation of the architecture-specific trampoline implementation (see `compiler-rt/lib/xray/xray_trampoline_x86_64.S` for reference). This patch does not include any functionality to resolve function IDs from DSOs for the provided logging/tracing modes. These modes still work and will record calls from DSOs, but symbol resolution for these functions in not available. Getting this to work properly requires recording information about the loaded DSOs and should IMO be discussed in a separate RFC, as there are mulitple feasible approaches. 
@petrhosek @jplehr --- clang/include/clang/Basic/CodeGenOptions.def | 2 + clang/include/clang/Driver/Options.td | 5 + clang/include/clang/Driver/XRayArgs.h | 4 + clang/lib/Driver/ToolChains/CommonArgs.cpp | 12 +- clang/lib/Driver/XRayArgs.cpp | 21 ++ clang/test/Driver/XRay/xray-shared.cpp | 17 + .../cmake/Modules/AllSupportedArchDefs.cmake | 1 + compiler-rt/cmake/config-ix.cmake | 4 + compiler-rt/include/xray/xray_interface.h | 55 +++- compiler-rt/lib/xray/CMakeLists.txt | 86 +++++- compiler-rt/lib/xray/xray_dso_init.cpp | 62 ++++ compiler-rt/lib/xray/xray_init.cpp | 183 +++++++++-- compiler-rt/lib/xray/xray_interface.cpp | 292 ++++++++++++++---- .../lib/xray/xray_interface_internal.h | 83 ++++- compiler-rt/lib/xray/xray_trampoline_x86_64.S | 24 +- compiler-rt/lib/xray/xray_x86_64.cpp | 23 +- .../xray/TestCases/Posix/basic-mode-dso.cpp | 47 +++ .../TestCases/Posix/clang-xray-shared.cpp | 14 + .../test/xray/TestCases/Posix/dlopen.cpp | 107 +++++++ .../xray/TestCases/Posix/dso-dep-chains.cpp | 197 ++++++++++++ .../TestCases/Posix/patch-premain-dso.cpp | 45 +++ .../Posix/patching-unpatching-dso.cpp | 75 +++++ 22 files changed, 1215 insertions(+), 144 deletions(-) create mode 100644 clang/test/Driver/XRay/xray-shared.cpp create mode 100644 compiler-rt/lib/xray/xray_dso_init.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/dlopen.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index eac831278ee20d..e45370bde74a5d 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ 
b/clang/include/clang/Basic/CodeGenOptions.def @@ -136,6 +136,8 @@ CODEGENOPT(XRayIgnoreLoops , 1, 0) ///< Emit the XRay function index section. CODEGENOPT(XRayFunctionIndex , 1, 1) +///< Set when -fxray-shared is enabled +CODEGENOPT(XRayShared , 1, 0) ///< Set the minimum number of instructions in a function to determine selective ///< XRay instrumentation. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d306c751505e98..4ee16e213d0e13 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2946,6 +2946,11 @@ def fxray_selected_function_group : HelpText<"When using -fxray-function-groups, select which group of functions to instrument. Valid range is 0 to fxray-function-groups - 1">, MarshallingInfoInt, "0">; +defm xray_shared : BoolFOption<"xray-shared", + CodeGenOpts<"XRayShared">, DefaultFalse, + PosFlag, + NegFlag>; defm fine_grained_bitfield_accesses : BoolOption<"f", "fine-grained-bitfield-accesses", CodeGenOpts<"FineGrainedBitfieldAccesses">, DefaultFalse, diff --git a/clang/include/clang/Driver/XRayArgs.h b/clang/include/clang/Driver/XRayArgs.h index bdd3d979547eed..8fbcf469e5bad1 100644 --- a/clang/include/clang/Driver/XRayArgs.h +++ b/clang/include/clang/Driver/XRayArgs.h @@ -27,6 +27,7 @@ class XRayArgs { XRayInstrSet InstrumentationBundle; llvm::opt::Arg *XRayInstrument = nullptr; bool XRayRT = true; + bool XRayShared = false; public: /// Parses the XRay arguments from an argument list. 
@@ -35,6 +36,9 @@ class XRayArgs { llvm::opt::ArgStringList &CmdArgs, types::ID InputType) const; bool needsXRayRt() const { return XRayInstrument && XRayRT; } + bool needsXRayDSORt() const { + return XRayInstrument && XRayRT && XRayShared; + } llvm::ArrayRef modeList() const { return Modes; } XRayInstrSet instrumentationBundle() const { return InstrumentationBundle; } }; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 0c6a585c3acffd..0a1b7c209563e8 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1613,10 +1613,14 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, } bool tools::addXRayRuntime(const ToolChain&TC, const ArgList &Args, ArgStringList &CmdArgs) { - if (Args.hasArg(options::OPT_shared)) - return false; - - if (TC.getXRayArgs().needsXRayRt()) { + if (Args.hasArg(options::OPT_shared)) { + if (TC.getXRayArgs().needsXRayDSORt()) { + CmdArgs.push_back("--whole-archive"); + CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray-dso")); + CmdArgs.push_back("--no-whole-archive"); + return true; + } + } else if (TC.getXRayArgs().needsXRayRt()) { CmdArgs.push_back("--whole-archive"); CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray")); for (const auto &Mode : TC.getXRayArgs().modeList()) diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index 8c5134e2501358..411054e067cb42 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -63,6 +63,23 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { << XRayInstrument->getSpelling() << Triple.str(); } + if (Args.hasFlag(options::OPT_fxray_shared, + options::OPT_fno_xray_shared, false)) { + XRayShared = true; + + // DSO instrumentation is currently limited to x86_64 + if (Triple.getArch() != llvm::Triple::x86_64) { + D.Diag(diag::err_drv_unsupported_opt_for_target) + << "-fxray-shared" << Triple.str(); + } + 
+ unsigned PICLvl = std::get<1>(tools::ParsePICArgs(TC, Args)); + if (!PICLvl) { + D.Diag(diag::err_opt_not_valid_without_opt) + << "-fxray-shared" << "-fPIC"; + } + } + // Both XRay and -fpatchable-function-entry use // TargetOpcode::PATCHABLE_FUNCTION_ENTER. if (Arg *A = Args.getLastArg(options::OPT_fpatchable_function_entry_EQ)) @@ -177,6 +194,10 @@ void XRayArgs::addArgs(const ToolChain &TC, const ArgList &Args, Args.addOptOutFlag(CmdArgs, options::OPT_fxray_function_index, options::OPT_fno_xray_function_index); + if (XRayShared) + Args.addOptInFlag(CmdArgs, options::OPT_fxray_shared, + options::OPT_fno_xray_shared); + if (const Arg *A = Args.getLastArg(options::OPT_fxray_instruction_threshold_EQ)) { int Value; diff --git a/clang/test/Driver/XRay/xray-shared.cpp b/clang/test/Driver/XRay/xray-shared.cpp new file mode 100644 index 00000000000000..215854e1fc7cef --- /dev/null +++ b/clang/test/Driver/XRay/xray-shared.cpp @@ -0,0 +1,17 @@ +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fpic -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fno-PIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-PIC +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fno-pic -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-PIC + +// On 64 bit darwin, PIC is always enabled +// RUN: %clang -### --target=x86_64-apple-darwin -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s + +// Check unsupported targets +// RUN: not %clang -### --target=aarch64-pc-freebsd -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s 
--check-prefix=ERR-TARGET +// RUN: not %clang -### --target=arm64-apple-macos -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-TARGET + +// CHECK: "-cc1" {{.*}}"-fxray-instrument" {{.*}}"-fxray-shared" +// ERR-TARGET: error: unsupported option '-fxray-shared' for target +// ERR-PIC: error: option '-fxray-shared' cannot be specified without '-fPIC' + diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake index 809e9277156912..50a4256b82fe4e 100644 --- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake +++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake @@ -104,6 +104,7 @@ else() set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64} ${MIPS32} ${MIPS64} powerpc64le ${HEXAGON} ${LOONGARCH64}) endif() +set(ALL_XRAY_DSO_SUPPORTED_ARCH ${X86_64}) set(ALL_SHADOWCALLSTACK_SUPPORTED_ARCH ${ARM64}) if (UNIX) diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index a93a88a9205001..6134c9876b38e9 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -668,6 +668,9 @@ if(APPLE) list_intersect(XRAY_SUPPORTED_ARCH ALL_XRAY_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) + list_intersect(XRAY_DSO_SUPPORTED_ARCH + ALL_XRAY_DSO_SUPPORTED_ARCH + SANITIZER_COMMON_SUPPORTED_ARCH) list_intersect(SHADOWCALLSTACK_SUPPORTED_ARCH ALL_SHADOWCALLSTACK_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) @@ -702,6 +705,7 @@ else() filter_available_targets(CFI_SUPPORTED_ARCH ${ALL_CFI_SUPPORTED_ARCH}) filter_available_targets(SCUDO_STANDALONE_SUPPORTED_ARCH ${ALL_SCUDO_STANDALONE_SUPPORTED_ARCH}) filter_available_targets(XRAY_SUPPORTED_ARCH ${ALL_XRAY_SUPPORTED_ARCH}) + filter_available_targets(XRAY_DSO_SUPPORTED_ARCH ${ALL_XRAY_DSO_SUPPORTED_ARCH}) filter_available_targets(SHADOWCALLSTACK_SUPPORTED_ARCH ${ALL_SHADOWCALLSTACK_SUPPORTED_ARCH}) filter_available_targets(GWP_ASAN_SUPPORTED_ARCH 
${ALL_GWP_ASAN_SUPPORTED_ARCH}) diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h index 727431c04e4f73..717cfe292ce416 100644 --- a/compiler-rt/include/xray/xray_interface.h +++ b/compiler-rt/include/xray/xray_interface.h @@ -93,31 +93,74 @@ enum XRayPatchingStatus { FAILED = 3, }; -/// This tells XRay to patch the instrumentation points. See XRayPatchingStatus +/// This tells XRay to patch the instrumentation points in all currently loaded objects. See XRayPatchingStatus /// for possible result values. extern XRayPatchingStatus __xray_patch(); +/// This tells XRay to patch the instrumentation points in the given object. +/// See XRayPatchingStatus for possible result values. +extern XRayPatchingStatus __xray_patch_object(int32_t ObjId); + /// Reverses the effect of __xray_patch(). See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_unpatch(); -/// This patches a specific function id. See XRayPatchingStatus for possible +/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for possible +/// result values. +extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); + +/// This unpacks the given (packed) function id and patches +/// the corresponding function. See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_patch_function(int32_t FuncId); -/// This unpatches a specific function id. See XRayPatchingStatus for possible +/// This patches a specific function in the given object. See XRayPatchingStatus for possible +/// result values. +extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId, + int32_t ObjId); + +/// This unpacks the given (packed) function id and unpatches +/// the corresponding function. See XRayPatchingStatus for possible /// result values. 
extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId); -/// This function returns the address of the function provided a valid function -/// id. We return 0 if we encounter any error, even if 0 may be a valid function +/// This unpatches a specific function in the given object. +/// See XRayPatchingStatus for possible result values. +extern XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId, + int32_t ObjId); + +/// This function unpacks the given (packed) function id and returns the address of the corresponding function. We return 0 if we encounter any error, even if 0 may be a valid function /// address. extern uintptr_t __xray_function_address(int32_t FuncId); -/// This function returns the maximum valid function id. Returns 0 if we +/// This function returns the address of the function in the given object provided valid function and object +/// ids. We return 0 if we encounter any error, even if 0 may be a valid function +/// address. +extern uintptr_t __xray_function_address_in_object(int32_t FuncId, + int32_t ObjId); + +/// This function returns the maximum valid function id for the main executable (object id = 0). Returns 0 if we /// encounter errors (when there are no instrumented functions, etc.). extern size_t __xray_max_function_id(); +/// This function returns the maximum valid function id for the given object. Returns 0 if we +/// encounter errors (when there are no instrumented functions, etc.). +extern size_t __xray_max_function_id_in_object(int32_t ObjId); + +/// This function returns the number of previously registered objects (executable + loaded DSOs). +/// Returns 0 if XRay has not been initialized. +extern size_t __xray_num_objects(); + +/// Unpacks the function id from the given packed id. +extern int32_t __xray_unpack_function_id(int32_t PackedId); + +/// Unpacks the object id from the given packed id. 
+extern int32_t __xray_unpack_object_id(int32_t PackedId); + +/// Creates and returns a packed id from the given function and object ids. +/// If the ids do not fit within the reserved number of bits for each part, the high bits are truncated. +extern int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId); + /// Initialize the required XRay data structures. This is useful in cases where /// users want to control precisely when the XRay instrumentation data /// structures are initialized, for example when the XRay library is built with diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt index cf7b5062aae32d..f38c07420c9abf 100644 --- a/compiler-rt/lib/xray/CMakeLists.txt +++ b/compiler-rt/lib/xray/CMakeLists.txt @@ -10,6 +10,10 @@ set(XRAY_SOURCES xray_utils.cpp ) +set(XRAY_DSO_SOURCES + xray_dso_init.cpp + ) + # Implementation files for all XRay modes. set(XRAY_FDR_MODE_SOURCES xray_fdr_flags.cpp @@ -33,6 +37,11 @@ set(x86_64_SOURCES xray_trampoline_x86_64.S ) +set(x86_64_DSO_SOURCES + xray_trampoline_x86_64.S + ) + + set(arm_SOURCES xray_arm.cpp xray_trampoline_arm.S @@ -128,10 +137,12 @@ set(XRAY_IMPL_HEADERS # consumption by tests. set(XRAY_ALL_SOURCE_FILES ${XRAY_SOURCES} + ${XRAY_DSO_SOURCES} ${XRAY_FDR_MODE_SOURCES} ${XRAY_BASIC_MODE_SOURCES} ${XRAY_PROFILING_MODE_SOURCES} ${x86_64_SOURCES} + ${x86_64_DSO_SOURCES} ${arm_SOURCES} ${armhf_SOURCES} ${hexagon_SOURCES} @@ -162,6 +173,9 @@ set(XRAY_CFLAGS ${COMPILER_RT_CXX_CFLAGS}) set(XRAY_COMMON_DEFINITIONS SANITIZER_COMMON_NO_REDEFINE_BUILTINS XRAY_HAS_EXCEPTIONS=1) +# DSO trampolines need to be compiled with GOT addressing +set(XRAY_COMMON_DEFINITIONS_DSO ${XRAY_COMMON_DEFINITIONS} XRAY_PIC) + # Too many existing bugs, needs cleanup. 
append_list_if(COMPILER_RT_HAS_WNO_FORMAT -Wno-format XRAY_CFLAGS) @@ -201,7 +215,16 @@ if (APPLE) CFLAGS ${XRAY_CFLAGS} DEFS ${XRAY_COMMON_DEFINITIONS} DEPS ${XRAY_DEPS}) + add_compiler_rt_object_libraries(RTXrayDSO + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_DSO_SUPPORTED_ARCH} + SOURCES ${XRAY_DSO_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS_DSO} + DEPS ${XRAY_DEPS}) set(XRAY_RTXRAY_ARCH_LIBS "") + set(XRAY_DSO_RTXRAY_ARCH_LIBS "") foreach(arch ${XRAY_SUPPORTED_ARCH}) if(NOT ${arch} IN_LIST XRAY_SOURCE_ARCHS) continue() @@ -215,6 +238,17 @@ if (APPLE) DEFS ${XRAY_COMMON_DEFINITIONS} DEPS ${XRAY_DEPS}) list(APPEND XRAY_RTXRAY_ARCH_LIBS RTXray_${arch}) + if (${arch} IN_LIST XRAY_DSO_SUPPORTED_ARCH) + add_compiler_rt_object_libraries(RTXrayDSO_${arch} + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_DSO_SUPPORTED_ARCH} + SOURCES ${${arch}_DSO_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS_DSO} + DEPS ${XRAY_DEPS}) + list(APPEND XRAY_DSO_RTXRAY_ARCH_LIBS RTXrayDSO_${arch}) + endif() endforeach() add_compiler_rt_object_libraries(RTXrayFDR OS ${XRAY_SUPPORTED_OS} @@ -252,6 +286,17 @@ if (APPLE) LINK_FLAGS ${XRAY_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} LINK_LIBS ${XRAY_LINK_LIBS} PARENT_TARGET xray) + add_compiler_rt_runtime(clang_rt.xray-dso + STATIC + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_DSO_SUPPORTED_ARCH} + OBJECT_LIBS RTXrayDSO ${XRAY_DSO_RTXRAY_ARCH_LIBS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + LINK_FLAGS ${XRAY_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + PARENT_TARGET xray) + add_compiler_rt_runtime(clang_rt.xray-fdr STATIC OS ${XRAY_SUPPORTED_OS} @@ -346,16 +391,37 @@ else() # not Apple DEFS ${XRAY_COMMON_DEFINITIONS} OBJECT_LIBS RTXrayBASIC PARENT_TARGET xray) - # Profiler Mode runtime - add_compiler_rt_runtime(clang_rt.xray-profiling - STATIC - ARCHS ${arch} - CFLAGS ${XRAY_CFLAGS} - LINK_FLAGS 
${XRAY_LINK_FLAGS} - LINK_LIBS ${XRAY_LINK_LIBS} - DEFS ${XRAY_COMMON_DEFINITIONS} - OBJECT_LIBS RTXrayPROFILING - PARENT_TARGET xray) + # Profiler Mode runtime + add_compiler_rt_runtime(clang_rt.xray-profiling + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + LINK_FLAGS ${XRAY_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayPROFILING + PARENT_TARGET xray) + + if (${arch} IN_LIST XRAY_DSO_SUPPORTED_ARCH) + # TODO: Only implemented for X86 at the moment + add_compiler_rt_object_libraries(RTXrayDSO + ARCHS ${arch} + SOURCES ${XRAY_DSO_SOURCES} ${${arch}_DSO_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS_DSO} + DEPS ${XRAY_DEPS}) + # DSO runtime archive + add_compiler_rt_runtime(clang_rt.xray-dso + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + LINK_FLAGS ${XRAY_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayDSO + PARENT_TARGET xray) + endif() endforeach() endif() # not Apple diff --git a/compiler-rt/lib/xray/xray_dso_init.cpp b/compiler-rt/lib/xray/xray_dso_init.cpp new file mode 100644 index 00000000000000..eb754db54c64fa --- /dev/null +++ b/compiler-rt/lib/xray/xray_dso_init.cpp @@ -0,0 +1,62 @@ +//===-- xray_init.cpp -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay initialisation logic for DSOs. 
+//===----------------------------------------------------------------------===// + +#include "sanitizer_common/sanitizer_atomic.h" +#include "xray_defs.h" +#include "xray_flags.h" +#include "xray_interface_internal.h" + +using namespace __sanitizer; + +extern "C" { +extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak)) +__attribute__((visibility("hidden"))); +extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak)) +__attribute__((visibility("hidden"))); +extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)) +__attribute__((visibility("hidden"))); +extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)) +__attribute__((visibility("hidden"))); + +#if SANITIZER_APPLE +// HACK: This is a temporary workaround to make XRay build on +// Darwin, but it will probably not work at runtime. +extern const XRaySledEntry __start_xray_instr_map[] = {}; +extern const XRaySledEntry __stop_xray_instr_map[] = {}; +extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {}; +extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {}; +#endif +} + +// Handler functions to call in the patched entry/exit sled. +extern atomic_uintptr_t XRayPatchedFunction; +extern atomic_uintptr_t XRayArgLogger; +extern atomic_uintptr_t XRayPatchedCustomEvent; +extern atomic_uintptr_t XRayPatchedTypedEvent; + +static int __xray_object_id{-1}; + +// Note: .preinit_array initialization does not work for DSOs +__attribute__((constructor(0))) static void +__xray_init_dso() XRAY_NEVER_INSTRUMENT { + // Register sleds in main XRay runtime. + __xray_object_id = + __xray_register_dso(__start_xray_instr_map, __stop_xray_instr_map, + __start_xray_fn_idx, __stop_xray_fn_idx, {}); +} + +__attribute__((destructor(0))) static void +__xray_finalize_dso() XRAY_NEVER_INSTRUMENT { + // Inform the main runtime that this DSO is no longer used. 
+ __xray_deregister_dso(__xray_object_id); +} diff --git a/compiler-rt/lib/xray/xray_init.cpp b/compiler-rt/lib/xray/xray_init.cpp index f22a31b95686d0..53c93be89cd148 100644 --- a/compiler-rt/lib/xray/xray_init.cpp +++ b/compiler-rt/lib/xray/xray_init.cpp @@ -16,6 +16,8 @@ #include #include "sanitizer_common/sanitizer_common.h" +#include "xray/xray_interface.h" +#include "xray_allocator.h" #include "xray_defs.h" #include "xray_flags.h" #include "xray_interface_internal.h" @@ -28,7 +30,7 @@ extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)); extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)); #if SANITIZER_APPLE -// HACK: This is a temporary workaround to make XRay build on +// HACK: This is a temporary workaround to make XRay build on // Darwin, but it will probably not work at runtime. const XRaySledEntry __start_xray_instr_map[] = {}; extern const XRaySledEntry __stop_xray_instr_map[] = {}; @@ -43,14 +45,16 @@ using namespace __xray; // the weak symbols defined above (__start_xray_inst_map and // __stop_xray_instr_map) to initialise the instrumentation map that XRay uses // for runtime patching/unpatching of instrumentation points. -// -// FIXME: Support DSO instrumentation maps too. The current solution only works -// for statically linked executables. atomic_uint8_t XRayInitialized{0}; // This should always be updated before XRayInitialized is updated. SpinMutex XRayInstrMapMutex; -XRaySledMap XRayInstrMap; + +// Contains maps for the main executable as well as DSOs. +XRaySledMap *XRayInstrMaps; + +// Number of binary objects registered. +atomic_uint32_t XRayNumObjects{0}; // Global flag to determine whether the flags have been initialized. atomic_uint8_t XRayFlagsInitialized{0}; @@ -58,6 +62,63 @@ atomic_uint8_t XRayFlagsInitialized{0}; // A mutex to allow only one thread to initialize the XRay data structures. 
SpinMutex XRayInitMutex; +// Registers XRay sleds and trampolines coming from the main executable or one +// of the linked DSOs. +// Returns the object ID if registration is successful, -1 otherwise. +int32_t +__xray_register_sleds(const XRaySledEntry *SledsBegin, + const XRaySledEntry *SledsEnd, + const XRayFunctionSledIndex *FnIndexBegin, + const XRayFunctionSledIndex *FnIndexEnd, bool FromDSO, + XRayTrampolines Trampolines) XRAY_NEVER_INSTRUMENT { + if (!SledsBegin || !SledsEnd) { + Report("Invalid XRay sleds.\n"); + return -1; + } + XRaySledMap SledMap; + SledMap.FromDSO = FromDSO; + SledMap.Loaded = true; + SledMap.Trampolines = Trampolines; + SledMap.Sleds = SledsBegin; + SledMap.Entries = SledsEnd - SledsBegin; + if (FnIndexBegin != nullptr) { + SledMap.SledsIndex = FnIndexBegin; + SledMap.Functions = FnIndexEnd - FnIndexBegin; + } else { + size_t CountFunctions = 0; + uint64_t LastFnAddr = 0; + + for (std::size_t I = 0; I < SledMap.Entries; I++) { + const auto &Sled = SledMap.Sleds[I]; + const auto Function = Sled.function(); + if (Function != LastFnAddr) { + CountFunctions++; + LastFnAddr = Function; + } + } + SledMap.SledsIndex = nullptr; + SledMap.Functions = CountFunctions; + } + if (SledMap.Functions >= XRayMaxFunctions) { + Report("Too many functions! Maximum is %ld\n", XRayMaxFunctions); + return -1; + } + + if (Verbosity()) + Report("Registering %d new functions!\n", SledMap.Functions); + + { + SpinMutexLock Guard(&XRayInstrMapMutex); + auto Idx = atomic_fetch_add(&XRayNumObjects, 1, memory_order_acq_rel); + if (Idx >= XRayMaxObjects) { + Report("Too many objects registered! Maximum is %ld\n", XRayMaxObjects); + return -1; + } + XRayInstrMaps[Idx] = std::move(SledMap); + return Idx; + } +} + // __xray_init() will do the actual loading of the current process' memory map // and then proceed to look for the .xray_instr_map section/segment. 
void __xray_init() XRAY_NEVER_INSTRUMENT { @@ -80,29 +141,21 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { return; } - { - SpinMutexLock Guard(&XRayInstrMapMutex); - XRayInstrMap.Sleds = __start_xray_instr_map; - XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map; - if (__start_xray_fn_idx != nullptr) { - XRayInstrMap.SledsIndex = __start_xray_fn_idx; - XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx; - } else { - size_t CountFunctions = 0; - uint64_t LastFnAddr = 0; - - for (std::size_t I = 0; I < XRayInstrMap.Entries; I++) { - const auto &Sled = XRayInstrMap.Sleds[I]; - const auto Function = Sled.function(); - if (Function != LastFnAddr) { - CountFunctions++; - LastFnAddr = Function; - } - } + atomic_store(&XRayNumObjects, 0, memory_order_release); - XRayInstrMap.Functions = CountFunctions; - } + // Pre-allocation takes up approx. 5kB for XRayMaxObjects=64. + XRayInstrMaps = allocateBuffer(XRayMaxObjects); + + int MainBinaryId = + __xray_register_sleds(__start_xray_instr_map, __stop_xray_instr_map, + __start_xray_fn_idx, __stop_xray_fn_idx, false, {}); + + // The executable should always get ID 0. + if (MainBinaryId != 0) { + Report("Registering XRay sleds failed.\n"); + return; } + atomic_store(&XRayInitialized, true, memory_order_release); #ifndef XRAY_NO_PREINIT @@ -111,6 +164,84 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { #endif } +// Registers XRay sleds and trampolines of an instrumented DSO. +// Returns the object ID if registration is successful, -1 otherwise. +// +// Default visibility is hidden, so we have to explicitly make it visible to +// DSO. +SANITIZER_INTERFACE_ATTRIBUTE int32_t __xray_register_dso( + const XRaySledEntry *SledsBegin, const XRaySledEntry *SledsEnd, + const XRayFunctionSledIndex *FnIndexBegin, + const XRayFunctionSledIndex *FnIndexEnd, + XRayTrampolines Trampolines) XRAY_NEVER_INSTRUMENT { + // Make sure XRay has been initialized in the main executable. 
+ __xray_init(); + + if (__xray_num_objects() == 0) { + if (Verbosity()) + Report("No XRay instrumentation map in main executable. Not initializing " + "XRay for DSO.\n"); + return -1; + } + + // Register sleds in global map. + int ObjId = __xray_register_sleds(SledsBegin, SledsEnd, FnIndexBegin, + FnIndexEnd, true, Trampolines); + +#ifndef XRAY_NO_PREINIT + if (ObjId >= 0 && flags()->patch_premain) + __xray_patch_object(ObjId); +#endif + + return ObjId; +} + +// Deregisters a DSO from the main XRay runtime. +// Called from the DSO-local runtime when the library is unloaded (e.g. if +// dlclose is called). +// Returns true if the object ID is valid and the DSO was successfully +// deregistered. +SANITIZER_INTERFACE_ATTRIBUTE bool +__xray_deregister_dso(int32_t ObjId) XRAY_NEVER_INSTRUMENT { + + if (!atomic_load(&XRayInitialized, memory_order_acquire)) { + if (Verbosity()) + Report("XRay has not been initialized. Cannot deregister DSO.\n"); + return false; + } + + if (ObjId <= 0 || ObjId >= __xray_num_objects()) { + if (Verbosity()) + Report("Can't deregister object with ID %d: ID is invalid.\n", ObjId); + return false; + } + + { + SpinMutexLock Guard(&XRayInstrMapMutex); + auto &Entry = XRayInstrMaps[ObjId]; + if (!Entry.FromDSO) { + if (Verbosity()) + Report("Can't deregister object with ID %d: object does not correspond " + "to a shared library.\n", + ObjId); + return false; + } + if (!Entry.Loaded) { + if (Verbosity()) + Report("Can't deregister object with ID %d: object is not loaded.\n", + ObjId); + return true; + } + // Mark DSO as unloaded. No need to unpatch. + Entry.Loaded = false; + } + + if (Verbosity()) + Report("Deregistered object with ID %d.\n", ObjId); + + return true; +} + // FIXME: Make check-xray tests work on FreeBSD without // SANITIZER_CAN_USE_PREINIT_ARRAY. // See sanitizer_internal_defs.h where the macro is defined. 
diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 5839043fcb93a8..16e60bfc22cd10 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "xray_interface_internal.h" +#include "llvm/Support/ErrorHandling.h" #include #include @@ -36,7 +37,8 @@ extern __sanitizer::SpinMutex XRayInstrMapMutex; extern __sanitizer::atomic_uint8_t XRayInitialized; -extern __xray::XRaySledMap XRayInstrMap; +extern __xray::XRaySledMap *XRayInstrMaps; +extern __sanitizer::atomic_uint32_t XRayNumObjects; namespace __xray { @@ -61,16 +63,16 @@ static const int16_t cSledLength = 20; #endif /* CPU architecture */ // This is the function to call when we encounter the entry or exit sleds. -atomic_uintptr_t XRayPatchedFunction{0}; +atomic_uintptr_t XRayPatchedFunction SANITIZER_INTERFACE_ATTRIBUTE{0}; // This is the function to call from the arg1-enabled sleds/trampolines. -atomic_uintptr_t XRayArgLogger{0}; +atomic_uintptr_t XRayArgLogger SANITIZER_INTERFACE_ATTRIBUTE{0}; // This is the function to call when we encounter a custom event log call. -atomic_uintptr_t XRayPatchedCustomEvent{0}; +atomic_uintptr_t XRayPatchedCustomEvent SANITIZER_INTERFACE_ATTRIBUTE{0}; // This is the function to call when we encounter a typed event log call. -atomic_uintptr_t XRayPatchedTypedEvent{0}; +atomic_uintptr_t XRayPatchedTypedEvent SANITIZER_INTERFACE_ATTRIBUTE{0}; // This is the global status to determine whether we are currently // patching/unpatching. 
@@ -150,27 +152,42 @@ class MProtectHelper { namespace { -bool patchSled(const XRaySledEntry &Sled, bool Enable, - int32_t FuncId) XRAY_NEVER_INSTRUMENT { +bool isObjectLoaded(int32_t ObjId) { + SpinMutexLock Guard(&XRayInstrMapMutex); + if (ObjId < 0 || + ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { + return false; + } + return XRayInstrMaps[ObjId].Loaded; +} + +bool patchSled(const XRaySledEntry &Sled, bool Enable, int32_t FuncId, + const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT { bool Success = false; switch (Sled.Kind) { case XRayEntryType::ENTRY: - Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_FunctionEntry); + Success = + patchFunctionEntry(Enable, FuncId, Sled, Trampolines.EntryTrampoline); break; case XRayEntryType::EXIT: - Success = patchFunctionExit(Enable, FuncId, Sled); + Success = + patchFunctionExit(Enable, FuncId, Sled, Trampolines.ExitTrampoline); break; case XRayEntryType::TAIL: - Success = patchFunctionTailExit(Enable, FuncId, Sled); + Success = patchFunctionTailExit(Enable, FuncId, Sled, + Trampolines.TailExitTrampoline); break; case XRayEntryType::LOG_ARGS_ENTRY: - Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry); + Success = + patchFunctionEntry(Enable, FuncId, Sled, Trampolines.LogArgsTrampoline); break; case XRayEntryType::CUSTOM_EVENT: - Success = patchCustomEvent(Enable, FuncId, Sled); + Success = patchCustomEvent(Enable, FuncId, Sled, + Trampolines.CustomEventTrampoline); break; case XRayEntryType::TYPED_EVENT: - Success = patchTypedEvent(Enable, FuncId, Sled); + Success = + patchTypedEvent(Enable, FuncId, Sled, Trampolines.TypedEventTrampoline); break; default: Report("Unsupported sled kind '%" PRIu64 "' @%04x\n", Sled.Address, @@ -205,10 +222,9 @@ findFunctionSleds(int32_t FuncId, return Index; } -XRayPatchingStatus patchFunction(int32_t FuncId, +XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, bool Enable) XRAY_NEVER_INSTRUMENT { - if 
(!atomic_load(&XRayInitialized, - memory_order_acquire)) + if (!atomic_load(&XRayInitialized, memory_order_acquire)) return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. uint8_t NotPatching = false; @@ -220,13 +236,24 @@ XRayPatchingStatus patchFunction(int32_t FuncId, XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - InstrMap = XRayInstrMap; + if (ObjId < 0 || + ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { + Report("Unable to patch function: invalid sled map index: %d", ObjId); + return XRayPatchingStatus::FAILED; + } + InstrMap = XRayInstrMaps[ObjId]; } // If we don't have an index, we can't patch individual functions. if (InstrMap.Functions == 0) return XRayPatchingStatus::NOT_INITIALIZED; + // Check if the corresponding DSO has been unloaded. + if (!InstrMap.Loaded) { + Report("Invalid function id provided: %d\n", FuncId); + return XRayPatchingStatus::NOT_INITIALIZED; + } + // FuncId must be a positive number, less than the number of functions // instrumented. if (FuncId <= 0 || static_cast(FuncId) > InstrMap.Functions) { @@ -234,6 +261,8 @@ XRayPatchingStatus patchFunction(int32_t FuncId, return XRayPatchingStatus::FAILED; } + auto PackedId = __xray::MakePackedId(FuncId, ObjId); + // Now we patch ths sleds for this specific function. 
XRayFunctionSledIndex SledRange; if (InstrMap.SledsIndex) { @@ -242,13 +271,13 @@ XRayPatchingStatus patchFunction(int32_t FuncId, } else { SledRange = findFunctionSleds(FuncId, InstrMap); } + auto *f = SledRange.Begin; bool SucceedOnce = false; for (size_t i = 0; i != SledRange.Size; ++i) - SucceedOnce |= patchSled(f[i], Enable, FuncId); + SucceedOnce |= patchSled(f[i], Enable, PackedId, InstrMap.Trampolines); - atomic_store(&XRayPatching, false, - memory_order_release); + atomic_store(&XRayPatching, false, memory_order_release); if (!SucceedOnce) { Report("Failed patching any sled for function '%d'.", FuncId); @@ -261,32 +290,31 @@ XRayPatchingStatus patchFunction(int32_t FuncId, // controlPatching implements the common internals of the patching/unpatching // implementation. |Enable| defines whether we're enabling or disabling the // runtime XRay instrumentation. -XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { - if (!atomic_load(&XRayInitialized, - memory_order_acquire)) - return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. - - uint8_t NotPatching = false; - if (!atomic_compare_exchange_strong( - &XRayPatching, &NotPatching, true, memory_order_acq_rel)) - return XRayPatchingStatus::ONGOING; // Already patching. - - uint8_t PatchingSuccess = false; - auto XRayPatchingStatusResetter = - at_scope_exit([&PatchingSuccess] { - if (!PatchingSuccess) - atomic_store(&XRayPatching, false, - memory_order_release); - }); - +// This function should only be called after ensuring that XRay is initialized +// and no other thread is currently patching. 
+XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - InstrMap = XRayInstrMap; + if (ObjId < 0 || + ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { + Report("Unable to patch functions: invalid sled map index: %d\n", ObjId); + return XRayPatchingStatus::FAILED; + } + InstrMap = XRayInstrMaps[ObjId]; } if (InstrMap.Entries == 0) return XRayPatchingStatus::NOT_INITIALIZED; + if (Verbosity()) + Report("Patching object %d with %d functions.\n", ObjId, InstrMap.Entries); + + // Check if the corresponding DSO has been unloaded. + if (!InstrMap.Loaded) { + Report("Object is not loaded at index: %d\n", ObjId); + return XRayPatchingStatus::FAILED; + } + uint32_t FuncId = 1; uint64_t CurFun = 0; @@ -336,20 +364,96 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { ++FuncId; CurFun = F; } - patchSled(Sled, Enable, FuncId); + auto PackedId = __xray::MakePackedId(FuncId, ObjId); + patchSled(Sled, Enable, PackedId, InstrMap.Trampolines); } - atomic_store(&XRayPatching, false, - memory_order_release); - PatchingSuccess = true; + atomic_store(&XRayPatching, false, memory_order_release); return XRayPatchingStatus::SUCCESS; } -XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, +// Controls patching for all registered objects. +// Returns: SUCCESS, if patching succeeds for all objects. +// NOT_INITIALIZED, if one or more objects returned NOT_INITIALIZED +// but none failed. +// FAILED, if patching of one or more objects failed. +XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { + if (!atomic_load(&XRayInitialized, memory_order_acquire)) + return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. + + uint8_t NotPatching = false; + if (!atomic_compare_exchange_strong(&XRayPatching, &NotPatching, true, + memory_order_acq_rel)) + return XRayPatchingStatus::ONGOING; // Already patching. 
+ + auto XRayPatchingStatusResetter = at_scope_exit( + [] { atomic_store(&XRayPatching, false, memory_order_release); }); + + unsigned NumObjects = __xray_num_objects(); + + XRayPatchingStatus CombinedStatus{NOT_INITIALIZED}; + for (unsigned I = 0; I < NumObjects; ++I) { + if (!isObjectLoaded(I)) + continue; + auto LastStatus = controlPatchingObjectUnchecked(Enable, I); + switch (LastStatus) { + case SUCCESS: + if (CombinedStatus == NOT_INITIALIZED) + CombinedStatus = SUCCESS; + break; + case FAILED: + // Report failure, but try to patch the remaining objects + CombinedStatus = FAILED; + break; + case NOT_INITIALIZED: + // XRay has been initialized but there are no sleds available for this + // object. Try to patch remaining objects. + if (CombinedStatus != FAILED) + CombinedStatus = NOT_INITIALIZED; + break; + case ONGOING: + llvm_unreachable("Status ONGOING should not appear at this point"); + default: + llvm_unreachable("Unhandled patching status"); + } + } + return CombinedStatus; +} + +// Controls patching for one object. +XRayPatchingStatus controlPatching(bool Enable, + int32_t ObjId) XRAY_NEVER_INSTRUMENT { + + if (!atomic_load(&XRayInitialized, memory_order_acquire)) + return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. + + uint8_t NotPatching = false; + if (!atomic_compare_exchange_strong(&XRayPatching, &NotPatching, true, + memory_order_acq_rel)) + return XRayPatchingStatus::ONGOING; // Already patching. 
+ + auto XRayPatchingStatusResetter = at_scope_exit( + [] { atomic_store(&XRayPatching, false, memory_order_release); }); + + return controlPatchingObjectUnchecked(Enable, ObjId); +} + +XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, int32_t ObjId, bool Enable) XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - InstrMap = XRayInstrMap; + if (ObjId < 0 || + ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { + Report("Unable to patch function: invalid sled map index: %d\n", ObjId); + return XRayPatchingStatus::FAILED; + } + InstrMap = XRayInstrMaps[ObjId]; + } + + // Check if the corresponding DSO has been unloaded. + if (!InstrMap.Loaded) { + Report("Object is not loaded at index: %d\n", ObjId); + return XRayPatchingStatus::FAILED; } // FuncId must be a positive number, less than the number of functions @@ -398,7 +502,7 @@ XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, Report("Failed mprotect: %d\n", errno); return XRayPatchingStatus::FAILED; } - return patchFunction(FuncId, Enable); + return patchFunction(FuncId, ObjId, Enable); } } // namespace @@ -412,12 +516,10 @@ using namespace __xray; int __xray_set_handler(void (*entry)(int32_t, XRayEntryType)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, - memory_order_acquire)) { + if (atomic_load(&XRayInitialized, memory_order_acquire)) { atomic_store(&__xray::XRayPatchedFunction, - reinterpret_cast(entry), - memory_order_release); + reinterpret_cast(entry), memory_order_release); return 1; } return 0; @@ -425,11 +527,9 @@ int __xray_set_handler(void (*entry)(int32_t, int __xray_set_customevent_handler(void (*entry)(void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, - memory_order_acquire)) { + if (atomic_load(&XRayInitialized, memory_order_acquire)) { atomic_store(&__xray::XRayPatchedCustomEvent, - reinterpret_cast(entry), - memory_order_release); + reinterpret_cast(entry), memory_order_release); return 
1; } return 0; @@ -437,11 +537,9 @@ int __xray_set_customevent_handler(void (*entry)(void *, size_t)) int __xray_set_typedevent_handler(void (*entry)(size_t, const void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, - memory_order_acquire)) { + if (atomic_load(&XRayInitialized, memory_order_acquire)) { atomic_store(&__xray::XRayPatchedTypedEvent, - reinterpret_cast(entry), - memory_order_release); + reinterpret_cast(entry), memory_order_release); return 1; } return 0; @@ -474,39 +572,78 @@ XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT { return controlPatching(true); } +XRayPatchingStatus __xray_patch_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { + return controlPatching(true, ObjId); +} + XRayPatchingStatus __xray_unpatch() XRAY_NEVER_INSTRUMENT { return controlPatching(false); } +XRayPatchingStatus __xray_unpatch_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { + return controlPatching(false, ObjId); +} + XRayPatchingStatus __xray_patch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - return mprotectAndPatchFunction(FuncId, true); + auto Ids = __xray::UnpackId(FuncId); + auto ObjId = Ids.first; + auto FnId = Ids.second; + return mprotectAndPatchFunction(FnId, ObjId, true); +} + +XRayPatchingStatus +__xray_patch_function_in_object(int32_t FuncId, + int32_t ObjId) XRAY_NEVER_INSTRUMENT { + return mprotectAndPatchFunction(FuncId, ObjId, true); } XRayPatchingStatus __xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - return mprotectAndPatchFunction(FuncId, false); + auto Ids = __xray::UnpackId(FuncId); + auto ObjId = Ids.first; + auto FnId = Ids.second; + return mprotectAndPatchFunction(FnId, ObjId, false); +} + +XRayPatchingStatus +__xray_unpatch_function_in_object(int32_t FuncId, + int32_t ObjId) XRAY_NEVER_INSTRUMENT { + return mprotectAndPatchFunction(FuncId, ObjId, false); } int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType, uint64_t)) { - if (!atomic_load(&XRayInitialized, - memory_order_acquire)) + 
if (!atomic_load(&XRayInitialized, memory_order_acquire)) return 0; // A relaxed write might not be visible even if the current thread gets // scheduled on a different CPU/NUMA node. We need to wait for everyone to // have this handler installed for consistency of collected data across CPUs. atomic_store(&XRayArgLogger, reinterpret_cast(entry), - memory_order_release); + memory_order_release); return 1; } int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); } -uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { +uintptr_t +__xray_function_address(int32_t CombinedFuncId) XRAY_NEVER_INSTRUMENT { + auto Ids = __xray::UnpackId(CombinedFuncId); + return __xray_function_address_in_object(Ids.second, Ids.first); +} + +uintptr_t __xray_function_address_in_object(int32_t FuncId, int32_t ObjId) + XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - InstrMap = XRayInstrMap; + auto count = atomic_load(&XRayNumObjects, memory_order_acquire); + if (ObjId < 0 || ObjId >= count) { + Report("Unable to determine function address: invalid sled map index %d " + "(size is %d)\n", + ObjId, (int)count); + return 0; + } + InstrMap = XRayInstrMaps[ObjId]; } if (FuncId <= 0 || static_cast(FuncId) > InstrMap.Functions) @@ -525,6 +662,29 @@ uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { } size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT { + return __xray_max_function_id_in_object(0); +} + +size_t __xray_max_function_id_in_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { + SpinMutexLock Guard(&XRayInstrMapMutex); + if (ObjId < 0 || ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) + return 0; + return XRayInstrMaps[ObjId].Functions; +} + +size_t __xray_num_objects() XRAY_NEVER_INSTRUMENT { SpinMutexLock Guard(&XRayInstrMapMutex); - return XRayInstrMap.Functions; + return atomic_load(&XRayNumObjects, memory_order_acquire); +} + +int32_t __xray_unpack_function_id(int32_t 
PackedId) { + return __xray::UnpackId(PackedId).second; +} + +int32_t __xray_unpack_object_id(int32_t PackedId) { + return __xray::UnpackId(PackedId).first; +} + +int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId) { + return __xray::MakePackedId(FuncId, ObjId); } diff --git a/compiler-rt/lib/xray/xray_interface_internal.h b/compiler-rt/lib/xray/xray_interface_internal.h index 80c07c167f6461..5fbaa9c3f315b1 100644 --- a/compiler-rt/lib/xray/xray_interface_internal.h +++ b/compiler-rt/lib/xray/xray_interface_internal.h @@ -18,6 +18,18 @@ #include "xray/xray_interface.h" #include #include +#include + +extern "C" { +// The following functions have to be defined in assembler, on a per-platform +// basis. See xray_trampoline_*.S files for implementations. +extern void __xray_FunctionEntry(); +extern void __xray_FunctionExit(); +extern void __xray_FunctionTailExit(); +extern void __xray_ArgLoggerEntry(); +extern void __xray_CustomEvent(); +extern void __xray_TypedEvent(); +} extern "C" { @@ -67,36 +79,77 @@ struct XRayFunctionSledIndex { uintptr_t(Begin)); } }; + +struct XRayTrampolines { + void (*EntryTrampoline)(); + void (*ExitTrampoline)(); + void (*TailExitTrampoline)(); + void (*LogArgsTrampoline)(); + void (*CustomEventTrampoline)(); + void (*TypedEventTrampoline)(); + + XRayTrampolines() { + // These resolve to the definitions in the respective executable or DSO. 
+ EntryTrampoline = __xray_FunctionEntry; + ExitTrampoline = __xray_FunctionExit; + TailExitTrampoline = __xray_FunctionTailExit; + LogArgsTrampoline = __xray_ArgLoggerEntry; + CustomEventTrampoline = __xray_CustomEvent; + TypedEventTrampoline = __xray_TypedEvent; + } +}; + +extern int32_t __xray_register_dso(const XRaySledEntry *SledsBegin, + const XRaySledEntry *SledsEnd, + const XRayFunctionSledIndex *FnIndexBegin, + const XRayFunctionSledIndex *FnIndexEnd, + XRayTrampolines Trampolines); + +extern bool __xray_deregister_dso(int32_t ObjId); } namespace __xray { +constexpr uint32_t XRayNFnBits = 24; +constexpr uint32_t XRayNObjBits = 8; + +constexpr uint32_t XRayFnBitMask = 0x00FFFFFF; +constexpr uint32_t XRayObjBitMask = 0xFF000000; + +constexpr size_t XRayMaxFunctions = 1 << XRayNFnBits; +constexpr size_t XRayMaxObjects = 1 << XRayNObjBits; + +inline int32_t MakePackedId(int32_t FnId, int32_t ObjId) { + return ((ObjId << XRayNFnBits) & XRayObjBitMask) | (FnId & XRayFnBitMask); +} + +inline std::pair UnpackId(int32_t PackedId) { + uint32_t ObjId = (PackedId & XRayObjBitMask) >> XRayNFnBits; + uint32_t FnId = PackedId & XRayFnBitMask; + return {ObjId, FnId}; +} + struct XRaySledMap { const XRaySledEntry *Sleds; size_t Entries; const XRayFunctionSledIndex *SledsIndex; size_t Functions; + XRayTrampolines Trampolines; + bool FromDSO; + bool Loaded; }; bool patchFunctionEntry(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, void (*Trampoline)()); -bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); +bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, + void (*Trampoline)()); bool patchFunctionTailExit(bool Enable, uint32_t FuncId, - const XRaySledEntry &Sled); -bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); -bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); + const XRaySledEntry &Sled, void (*Trampoline)()); +bool patchCustomEvent(bool 
Enable, uint32_t FuncId, const XRaySledEntry &Sled, + void (*Trampoline)()); +bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, + void (*Trampoline)()); } // namespace __xray -extern "C" { -// The following functions have to be defined in assembler, on a per-platform -// basis. See xray_trampoline_*.S files for implementations. -extern void __xray_FunctionEntry(); -extern void __xray_FunctionExit(); -extern void __xray_FunctionTailExit(); -extern void __xray_ArgLoggerEntry(); -extern void __xray_CustomEvent(); -extern void __xray_TypedEvent(); -} - #endif diff --git a/compiler-rt/lib/xray/xray_trampoline_x86_64.S b/compiler-rt/lib/xray/xray_trampoline_x86_64.S index 01098f60eeab8b..0f480547b52cc6 100644 --- a/compiler-rt/lib/xray/xray_trampoline_x86_64.S +++ b/compiler-rt/lib/xray/xray_trampoline_x86_64.S @@ -107,6 +107,16 @@ .section __TEXT,__text #endif +.macro LOAD_HANDLER_ADDR handler +#if !defined(XRAY_PIC) + movq ASM_SYMBOL(\handler)(%rip), %rax +#else + movq ASM_SYMBOL(\handler)@GOTPCREL(%rip), %rax + movq (%rax), %rax +#endif +.endm + + //===----------------------------------------------------------------------===// .globl ASM_SYMBOL(__xray_FunctionEntry) @@ -121,7 +131,7 @@ ASM_SYMBOL(__xray_FunctionEntry): // This load has to be atomic, it's concurrent with __xray_patch(). // On x86/amd64, a simple (type-aligned) MOV instruction is enough. 
- movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE testq %rax, %rax je LOCAL_LABEL(tmp0) @@ -159,7 +169,7 @@ ASM_SYMBOL(__xray_FunctionExit): movupd %xmm1, 16(%rsp) movq %rax, 8(%rsp) movq %rdx, 0(%rsp) - movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE testq %rax,%rax je LOCAL_LABEL(tmp2) @@ -195,7 +205,7 @@ ASM_SYMBOL(__xray_FunctionTailExit): SAVE_REGISTERS ALIGN_STACK_16B - movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE testq %rax,%rax je LOCAL_LABEL(tmp4) @@ -224,12 +234,12 @@ ASM_SYMBOL(__xray_ArgLoggerEntry): ALIGN_STACK_16B // Again, these function pointer loads must be atomic; MOV is fine. - movq ASM_SYMBOL(_ZN6__xray13XRayArgLoggerE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray13XRayArgLoggerE testq %rax, %rax jne LOCAL_LABEL(arg1entryLog) // If [arg1 logging handler] not set, defer to no-arg logging. - movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE testq %rax, %rax je LOCAL_LABEL(arg1entryFail) @@ -268,7 +278,7 @@ ASM_SYMBOL(__xray_CustomEvent): // We take two arguments to this trampoline, which should be in rdi and rsi // already. - movq ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray22XRayPatchedCustomEventE testq %rax,%rax je LOCAL_LABEL(customEventCleanup) @@ -293,7 +303,7 @@ ASM_SYMBOL(__xray_TypedEvent): // We pass three arguments to this trampoline, which should be in rdi, rsi // and rdx without our intervention. 
- movq ASM_SYMBOL(_ZN6__xray21XRayPatchedTypedEventE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray21XRayPatchedTypedEventE testq %rax,%rax je LOCAL_LABEL(typedEventCleanup) diff --git a/compiler-rt/lib/xray/xray_x86_64.cpp b/compiler-rt/lib/xray/xray_x86_64.cpp index b9666a40861d48..663a51b2686614 100644 --- a/compiler-rt/lib/xray/xray_x86_64.cpp +++ b/compiler-rt/lib/xray/xray_x86_64.cpp @@ -170,7 +170,8 @@ bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, } bool patchFunctionExit(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -192,11 +193,11 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, // Prerequisite is to compute the relative offset fo the // __xray_FunctionExit function's address. const uint64_t Address = Sled.address(); - int64_t TrampolineOffset = reinterpret_cast(__xray_FunctionExit) - + int64_t TrampolineOffset = reinterpret_cast(Trampoline) - (static_cast(Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Exit trampoline (%p) too far from sled (%p)\n", - reinterpret_cast(__xray_FunctionExit), + reinterpret_cast(Trampoline), reinterpret_cast(Address)); return false; } @@ -217,16 +218,16 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, } bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the tail call sled with a similar // sequence as the entry sled, but calls the tail exit sled instead. 
const uint64_t Address = Sled.address(); - int64_t TrampolineOffset = - reinterpret_cast(__xray_FunctionTailExit) - - (static_cast(Address) + 11); + int64_t TrampolineOffset = reinterpret_cast(Trampoline) - + (static_cast(Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n", - reinterpret_cast(__xray_FunctionTailExit), + reinterpret_cast(Trampoline), reinterpret_cast(Address)); return false; } @@ -247,7 +248,8 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, } bool patchCustomEvent(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -275,7 +277,8 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, } bool patchTypedEvent(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: diff --git a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp new file mode 100644 index 00000000000000..31c615bd1f81bf --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp @@ -0,0 +1,47 @@ +// Testing shared library support in basic logging mode. 
+ +// RUN: split-file %s %t +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o + +// RUN: XRAY_OPTIONS="patch_premain=false,xray_mode=xray-basic,xray_logfile_base=basic-mode-dso-,verbosity=1" XRAY_BASIC_OPTIONS="func_duration_threshold_us=0" %run %t/main.o 2>&1 | FileCheck %s +// RUN: %llvm_xray account --format=csv --sort=funcid "`ls basic-mode-dso-* | head -1`" | FileCheck --check-prefix=ACCOUNT %s +// RUN: rm basic-mode-dso-* + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include +#include + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); + sleep(1); +} + +extern void instrumented_in_dso(); + +int main() { + // Explicit patching to ensure the DSO has been loaded + __xray_patch(); + instrumented_in_executable(); + // CHECK: instrumented_in_executable called + instrumented_in_dso(); + // CHECK-NEXT: instrumented_in_dso called +} + +//--- testlib.cpp + +#include +#include + +[[clang::xray_always_instrument]] void instrumented_in_dso() { + printf("instrumented_in_dso called\n"); +} + +// ACCOUNT: funcid,count,min,median,90%ile,99%ile,max,sum,debug,function +// ACCOUNT-NEXT: 1,1,{{.*}} +// ACCOUNT-NEXT: 16777217,1,{{.*}} diff --git a/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp b/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp new file mode 100644 index 00000000000000..92f3c29e970d42 --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp @@ -0,0 +1,14 @@ +// Test that the DSO-local runtime library has been linked if -fxray-shared is passed. 
+// +// RUN: %clangxx -fxray-instrument -fxray-shared %s -shared -o %t.so +// RUN: llvm-nm %t.so | FileCheck %s --check-prefix ENABLED + +// RUN: %clangxx -fxray-instrument %s -shared -o %t.so +// RUN: llvm-nm %t.so | FileCheck %s --check-prefix DISABLED +// +// REQUIRES: target=x86_64{{.*}} + +[[clang::xray_always_instrument]] int always_instrumented() { return 42; } + +// ENABLED: __start_xray_instr_map +// DISABLED-NOT: __start_xray_instr_map diff --git a/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp b/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp new file mode 100644 index 00000000000000..9db411d5ff1c6e --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp @@ -0,0 +1,107 @@ +// Check that we can patch and un-patch DSOs loaded with dlopen. +// + +// RUN: split-file %s %t +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so +// RUN: %clangxx_xray -g -fPIC -rdynamic -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp -o %t/main.o +// +// RUN: XRAY_OPTIONS="patch_premain=true" %run %t/main.o %t/testlib.so 2>&1 | FileCheck %s + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include +#include + +void test_handler(int32_t fid, XRayEntryType type) { + printf("called: %d, type=%d\n", fid, static_cast(type)); +} + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); +} + +typedef void (*dso_func_type)(); + +int main(int argc, char **argv) { + if (argc < 2) { + printf("Shared library argument missing\n"); + // CHECK-NOT: Shared library argument missing + return 1; + } + + const char *dso_path = argv[1]; + + void *dso_handle = dlopen(dso_path, RTLD_LAZY); + if (!dso_handle) { + printf("Failed to load shared library\n"); + char *error = dlerror(); + if (error) { + fprintf(stderr, "%s\n", error); + return 1; + } + return 1; + } + + dso_func_type instrumented_in_dso = + 
(dso_func_type)dlsym(dso_handle, "_Z19instrumented_in_dsov"); + if (!instrumented_in_dso) { + printf("Failed to find symbol\n"); + char *error = dlerror(); + if (error) { + fprintf(stderr, "%s\n", error); + return 1; + } + return 1; + } + + __xray_set_handler(test_handler); + + instrumented_in_executable(); + // CHECK: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{.*}}, type=1 + instrumented_in_dso(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_dso called + // CHECK-NEXT: called: {{.*}}, type=1 + + auto status = __xray_unpatch(); + printf("unpatching status: %d\n", static_cast(status)); + // CHECK-NEXT: unpatching status: 1 + + instrumented_in_executable(); + // CHECK-NEXT: instrumented_in_executable called + instrumented_in_dso(); + // CHECK-NEXT: instrumented_in_dso called + + status = __xray_patch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 + + instrumented_in_executable(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{.*}}, type=1 + instrumented_in_dso(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_dso called + // CHECK-NEXT: called: {{.*}}, type=1 + + dlclose(dso_handle); + + status = __xray_unpatch(); + printf("unpatching status: %d\n", static_cast(status)); + // CHECK-NEXT: unpatching status: 1 +} + +//--- testlib.cpp + +#include + +[[clang::xray_always_instrument]] void instrumented_in_dso() { + printf("instrumented_in_dso called\n"); +} diff --git a/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp b/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp new file mode 100644 index 00000000000000..89da2764c35cee --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp @@ -0,0 +1,197 @@ +// Check that loading libraries with different modes (RTLD_LOCAL/RTLD_GLOBAL) +// and dependencies on other DSOs 
work correctly. +// + +// RUN: split-file %s %t +// +// Build shared libs with dependencies b->c and e->f +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testliba.cpp -o %t/testliba.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibc.cpp -o %t/testlibc.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibb.cpp %t/testlibc.so -o %t/testlibb.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibd.cpp -o %t/testlibd.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibf.cpp -o %t/testlibf.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibe.cpp %t/testlibf.so -o %t/testlibe.so +// +// Executable links with a and b explicitly and loads d and e at runtime. +// RUN: %clangxx_xray -g -fPIC -rdynamic -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testliba.so %t/testlibb.so -o %t/main.o +// +// RUN: XRAY_OPTIONS="patch_premain=true" %run %t/main.o %t/testlibd.so %t/testlibe.so 2>&1 | FileCheck %s + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include +#include + +[[clang::xray_never_instrument]] void test_handler(int32_t fid, + XRayEntryType type) { + printf("called: %d, object=%d, fn=%d, type=%d\n", fid, (fid >> 24) & 0xFF, + fid & 0x00FFFFFF, static_cast(type)); +} + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); +} + +typedef void (*dso_func_type)(); + +[[clang::xray_never_instrument]] void *load_dso(const char *path, int mode) { + void *dso_handle = dlopen(path, mode); + if (!dso_handle) { + printf("failed to load shared library\n"); + char *error = dlerror(); + if (error) { + fprintf(stderr, "%s\n", error); + } + return nullptr; + } + return dso_handle; +} + 
+[[clang::xray_never_instrument]] void find_and_call(void *dso_handle, + const char *fn) { + dso_func_type dso_fn = (dso_func_type)dlsym(dso_handle, fn); + if (!dso_fn) { + printf("failed to find symbol\n"); + char *error = dlerror(); + if (error) { + fprintf(stderr, "%s\n", error); + } + return; + } + dso_fn(); +} + +extern void a(); +extern void b(); + +int main(int argc, char **argv) { + + if (argc < 3) { + printf("Shared library arguments missing\n"); + // CHECK-NOT: Shared library arguments missing + return 1; + } + + const char *dso_path_d = argv[1]; + const char *dso_path_e = argv[2]; + + __xray_set_handler(test_handler); + + instrumented_in_executable(); + // CHECK: called: {{[0-9]+}}, object=0, fn={{[0-9]+}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{[0-9]+}}, object=0, fn={{[0-9]+}}, type=1 + + a(); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ1:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: a called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ1]], fn=1, type=1 + + // Make sure this object ID does not appear again + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ1]] + + b(); // b calls c + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ2:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: b called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ3:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: c called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ3]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ3]] + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ2]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ2]] + + // Now check explicit loading with RTLD_LOCAL + + void *dso_handle_d = load_dso(dso_path_d, RTLD_LAZY | RTLD_LOCAL); + void *dso_handle_e = load_dso(dso_path_e, RTLD_LAZY | RTLD_LOCAL); + // CHECK-NOT: failed to load shared library + + find_and_call(dso_handle_d, "_Z1dv"); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ4:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: d called + // CHECK-NEXT: called: 
{{[0-9]+}}, object=[[OBJ4]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ4]] + + find_and_call(dso_handle_e, "_Z1ev"); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ5:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: e called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ6:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: f called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ6]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ6]] + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ5]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ5]] + + // Unload DSOs + dlclose(dso_handle_d); + dlclose(dso_handle_e); + + // Repeat test with RTLD_GLOBAL + dso_handle_d = load_dso(dso_path_d, RTLD_LAZY | RTLD_GLOBAL); + dso_handle_e = load_dso(dso_path_e, RTLD_LAZY | RTLD_GLOBAL); + // CHECK-NOT: failed to load shared library + + find_and_call(dso_handle_d, "_Z1dv"); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ7:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: d called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ7]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ7]] + + find_and_call(dso_handle_e, "_Z1ev"); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ8:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: e called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ9:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: f called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ9]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ9]] + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ8]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ8]] + + auto status = __xray_unpatch(); + printf("unpatching status: %d\n", static_cast(status)); + // CHECK-NEXT: unpatching status: 1 + + dlclose(dso_handle_d); + dlclose(dso_handle_e); +} + +//--- libgenmacro.inc +#include +// Helper macros to quickly generate libraries containing a single function. 
+#define GENERATE_LIB(NAME) \ + [[clang::xray_always_instrument]] void NAME() { printf(#NAME " called\n"); } + +#define GENERATE_LIB_WITH_CALL(NAME, FN) \ + extern void FN(); \ + [[clang::xray_always_instrument]] void NAME() { \ + printf(#NAME " called\n"); \ + FN(); \ + } + +//--- testliba.cpp +#include "libgenmacro.inc" +GENERATE_LIB(a) + +//--- testlibb.cpp +#include "libgenmacro.inc" +GENERATE_LIB_WITH_CALL(b, c) + +//--- testlibc.cpp +#include "libgenmacro.inc" +GENERATE_LIB(c) + +//--- testlibd.cpp +#include "libgenmacro.inc" +GENERATE_LIB(d) + +//--- testlibe.cpp +#include "libgenmacro.inc" +GENERATE_LIB_WITH_CALL(e, f) + +//--- testlibf.cpp +#include "libgenmacro.inc" +GENERATE_LIB(f) diff --git a/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp new file mode 100644 index 00000000000000..0708d0383439d0 --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp @@ -0,0 +1,45 @@ +// Checking that DSOs are automatically patched upon load, if patch_premain is passed. 
+ +// RUN: split-file %s %t +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o + +// RUN: XRAY_OPTIONS="patch_premain=true,verbosity=1" %run %t/main.o 2>&1 | FileCheck %s + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include + +void test_handler(int32_t fid, XRayEntryType type) { + printf("called: %d, type=%d\n", fid, static_cast(type)); +} + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); +} + +extern void instrumented_in_dso(); + +int main() { + __xray_set_handler(test_handler); + instrumented_in_executable(); + // CHECK: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{.*}}, type=1 + instrumented_in_dso(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_dso called + // CHECK-NEXT: called: {{.*}}, type=1 +} + +//--- testlib.cpp + +#include + +[[clang::xray_always_instrument]] void instrumented_in_dso() { + printf("instrumented_in_dso called\n"); +} diff --git a/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp new file mode 100644 index 00000000000000..d3e992dd497725 --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp @@ -0,0 +1,75 @@ +// Check that we can patch and un-patch on demand, and that logging gets invoked +// appropriately. 
+// + +// RUN: split-file %s %t +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o + +// RUN: XRAY_OPTIONS="patch_premain=false" %run %t/main.o 2>&1 | FileCheck %s + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include + +bool called = false; + +void test_handler(int32_t fid, XRayEntryType type) { + printf("called: %d, type=%d\n", fid, static_cast(type)); + called = true; +} + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); +} + +extern void instrumented_in_dso(); + +int main() { + __xray_set_handler(test_handler); + instrumented_in_executable(); + // CHECK: instrumented_in_executable called + instrumented_in_dso(); + // CHECK: instrumented_in_dso called + auto status = __xray_patch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 + instrumented_in_executable(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{.*}}, type=1 + instrumented_in_dso(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_dso called + // CHECK-NEXT: called: {{.*}}, type=1 + status = __xray_unpatch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 + instrumented_in_executable(); + // CHECK-NEXT: instrumented_in_executable called + instrumented_in_dso(); + // CHECK-NEXT: instrumented_in_dso called + status = __xray_patch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 + __xray_remove_handler(); + instrumented_in_executable(); + // CHECK-NEXT: instrumented_in_executable called + instrumented_in_dso(); + // CHECK-NEXT: instrumented_in_dso called + status = 
__xray_unpatch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 +} + +//--- testlib.cpp + +#include + +[[clang::xray_always_instrument]] void instrumented_in_dso() { + printf("instrumented_in_dso called\n"); +} From 42ec740d0347a89b656c9be5ac4a7e4d8bcd30d5 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 11:36:55 +0200 Subject: [PATCH 144/177] [clang][ExprConstant] Remove an outdated TODO comment (#111959) Seems like passing the quantities directly seems to work fine. --- clang/lib/AST/ExprConstant.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 4d5af96093cfeb..06e653f96d6de1 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -9768,11 +9768,8 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, if (BaseAlignment < Align) { Result.Designator.setInvalid(); - // FIXME: Add support to Diagnostic for long / long long. - CCEDiag(E->getArg(0), - diag::note_constexpr_baa_insufficient_alignment) << 0 - << (unsigned)BaseAlignment.getQuantity() - << (unsigned)Align.getQuantity(); + CCEDiag(E->getArg(0), diag::note_constexpr_baa_insufficient_alignment) + << 0 << BaseAlignment.getQuantity() << Align.getQuantity(); return false; } } @@ -9783,11 +9780,11 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, (OffsetResult.Base ? 
CCEDiag(E->getArg(0), - diag::note_constexpr_baa_insufficient_alignment) << 1 + diag::note_constexpr_baa_insufficient_alignment) + << 1 : CCEDiag(E->getArg(0), diag::note_constexpr_baa_value_insufficient_alignment)) - << (int)OffsetResult.Offset.getQuantity() - << (unsigned)Align.getQuantity(); + << OffsetResult.Offset.getQuantity() << Align.getQuantity(); return false; } From 7b0d56be1d002e9cf0d8dda8ecaee99c5dbc88cf Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 11 Oct 2024 11:40:27 +0200 Subject: [PATCH 145/177] AMDGPU/GlobalISel: Fix inst-selection of ballot (#109986) Both input and output of ballot are lane-masks: result is lane-mask with 'S32/S64 LLT and SGPR bank' input is lane-mask with 'S1 LLT and VCC reg bank'. Ballot copies bits from input lane-mask for all active lanes and puts 0 for inactive lanes. GlobalISel did not set 0 in result for inactive lanes for non-constant input. --- llvm/docs/AMDGPUUsage.rst | 6 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 101 +++++++++++++----- .../GlobalISel/llvm.amdgcn.ballot.i32.ll | 90 +++++++++++++++- .../GlobalISel/llvm.amdgcn.ballot.i64.ll | 58 +++++++++- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 77 ++++++++++++- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 47 ++++++++ .../AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll | 20 ++-- 8 files changed, 360 insertions(+), 41 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 6ff3272422fe95..aba39762861dd8 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1369,6 +1369,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics. sign-extended from the width of the underlying PC hardware register even on processors where the s_getpc_b64 instruction returns a zero-extended value. + llvm.amdgcn.ballot Returns a bitfield(i32 or i64) containing the result of its i1 argument + in all active lanes, and zero in all inactive lanes. 
+ Provides a way to convert i1 in LLVM IR to i32 or i64 lane mask - bitfield + used by hardware to control active lanes when used in EXEC register. + For example, ballot(i1 true) return EXEC mask. + ============================================== ========================================================== .. TODO:: diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 2738eb77b675ab..715f2cc917e21c 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2086,6 +2086,8 @@ def int_amdgcn_fcmp : [IntrNoMem, IntrConvergent, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +// Returns a bitfield(i32 or i64) containing the result of its i1 argument +// in all active lanes, and zero in all inactive lanes. def int_amdgcn_ballot : Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 5be0a049cc5827..53628981e12409 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1413,50 +1413,101 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { return true; } +// Ballot has to zero bits in input lane-mask that are zero in current exec, +// Done as AND with exec. For inputs that are results of instruction that +// implicitly use same exec, for example compares in same basic block or SCC to +// VCC copy, use copy. +static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, + MachineBasicBlock *MBB) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getParent() != MBB) + return false; + + // Lane mask generated by SCC to VCC copy. 
+ if (MI->getOpcode() == AMDGPU::COPY) { + auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg()); + auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg()); + if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID && + SrcRB->getID() == AMDGPU::SGPRRegBankID) + return true; + } + + // Lane mask generated using compare with same exec. + if (isa(MI)) + return true; + + Register LHS, RHS; + // Look through AND. + if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS)))) + return isLaneMaskFromSameBlock(LHS, MRI, MBB) || + isLaneMaskFromSameBlock(RHS, MRI, MBB); + + return false; +} + bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); Register DstReg = I.getOperand(0).getReg(); - const unsigned Size = MRI->getType(DstReg).getSizeInBits(); - const bool Is64 = Size == 64; - const bool IsWave32 = (STI.getWavefrontSize() == 32); + Register SrcReg = I.getOperand(2).getReg(); + const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits(); + const unsigned WaveSize = STI.getWavefrontSize(); // In the common case, the return type matches the wave size. // However we also support emitting i64 ballots in wave32 mode. - if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32)) + if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32)) return false; std::optional Arg = - getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI); + getIConstantVRegValWithLookThrough(SrcReg, *MRI); + + Register Dst = DstReg; + // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot. 
+ if (BallotSize != WaveSize) { + Dst = MRI->createVirtualRegister(TRI.getBoolRC()); + } - const auto BuildCopy = [&](Register SrcReg) { - if (Size == STI.getWavefrontSize()) { - BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg) - .addReg(SrcReg); - return; + if (Arg) { + const int64_t Value = Arg->Value.getZExtValue(); + if (Value == 0) { + // Dst = S_MOV 0 + unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; + BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0); + } else { + // Dst = COPY EXEC + assert(Value == 1); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec()); } + if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI)) + return false; + } else { + if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) { + // Dst = COPY SrcReg + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg); + if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI)) + return false; + } else { + // Dst = S_AND SrcReg, EXEC + unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; + auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst) + .addReg(SrcReg) + .addReg(TRI.getExec()) + .setOperandDead(3); // Dead scc + if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI)) + return false; + } + } - // If emitting a i64 ballot in wave32, fill the upper bits with zeroes. + // i64 ballot on Wave32: zero-extend i32 ballot to i64. + if (BallotSize != WaveSize) { Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0); BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) - .addReg(SrcReg) + .addReg(Dst) .addImm(AMDGPU::sub0) .addReg(HiReg) .addImm(AMDGPU::sub1); - }; - - if (Arg) { - const int64_t Value = Arg->Value.getSExtValue(); - if (Value == 0) { - unsigned Opcode = Is64 ? 
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; - BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); - } else if (Value == -1) // all ones - BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); - else - return false; - } else - BuildCopy(I.getOperand(2).getReg()); + } I.eraseFromParent(); return true; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index 96cab200b61cdb..2edcf23df411df 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX11 %s declare i32 @llvm.amdgcn.ballot.i32(i1) declare i32 @llvm.ctpop.i32(i32) @@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) { ; CHECK-LABEL: non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo ; CHECK-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc) @@ -89,7 +90,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; CHECK-NEXT: s_cmp_eq_u32 
s0, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -137,7 +139,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -419,3 +422,80 @@ true: false: ret i32 33 } + +; Input that is not constant or direct result of a compare. +; Tests setting 0 to inactive lanes. +define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; GFX10-LABEL: non_cst_non_compare_input: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-NEXT: ; %bb.1: ; %B +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; %bb.2: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX10-NEXT: ; %bb.3: ; %A +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; %bb.4: ; %exit +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s0, s0, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: non_cst_non_compare_input: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_and_b32 s0, 1, s0 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX11-NEXT: 
v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11-NEXT: ; %bb.1: ; %B +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX11-NEXT: ; %bb.3: ; %A +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo +; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: ; %bb.4: ; %exit +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_and_b32 s0, s0, exec_lo +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi) + store i32 %ballot, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll index a18f843440445c..0bbb40b8db43ab 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -34,7 +34,8 @@ define amdgpu_cs i64 @non_compare(i32 %x) { ; CHECK-LABEL: non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec ; CHECK-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc) @@ -92,7 +93,8 @@ define amdgpu_cs i32 
@branch_divergent_ballot_ne_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cmp_eq_u64 vcc, 0 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -140,7 +142,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -422,3 +425,52 @@ true: false: ret i32 33 } + +; Input that is not constant or direct result of a compare. +; Tests setting 0 to inactive lanes. +define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; CHECK-LABEL: non_cst_non_compare_input: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; CHECK-NEXT: ; %bb.1: ; %B +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc +; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; CHECK-NEXT: ; implicit-def: $vgpr2 +; CHECK-NEXT: ; %bb.2: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; CHECK-NEXT: ; %bb.3: ; %A +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 1, v2 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc +; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; CHECK-NEXT: ; %bb.4: ; %exit +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: v_mov_b32_e32 
v3, s1 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; CHECK-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi) + store i64 %ballot, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll index 047b35b8c0f9d8..026a8d7da7080b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX11 %s declare i32 @llvm.amdgcn.ballot.i32(i1) declare i32 @llvm.ctpop.i32(i32) @@ -522,3 +522,76 @@ true: false: ret i32 33 } + +; Input that is not constant or direct result of a compare. +; Tests setting 0 to inactive lanes. 
+define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; GFX10-LABEL: non_cst_non_compare_input: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: ; implicit-def: $sgpr0 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-NEXT: ; %bb.1: ; %B +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX10-NEXT: ; %bb.2: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX10-NEXT: ; %bb.3: ; %A +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; %bb.4: ; %exit +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: non_cst_non_compare_input: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: ; implicit-def: $sgpr0 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11-NEXT: ; %bb.1: ; %B +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX11-NEXT: ; %bb.3: ; %A +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo +; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: ; %bb.4: ; %exit +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi) + store i32 %ballot, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll index 61f0f20f057043..c7597e98a6d583 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll @@ -511,3 +511,50 @@ true: false: ret i32 33 } + +; Input that is not constant or direct result of a compare. +; Tests setting 0 to inactive lanes. +define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; CHECK-LABEL: non_cst_non_compare_input: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: ; implicit-def: $sgpr0_sgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; CHECK-NEXT: ; %bb.1: ; %B +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec +; CHECK-NEXT: ; implicit-def: $vgpr2 +; CHECK-NEXT: ; %bb.2: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; CHECK-NEXT: ; %bb.3: ; %A +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec +; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; CHECK-NEXT: ; %bb.4: ; %exit +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; CHECK-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; CHECK-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, 
label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi) + store i64 %ballot, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll index 5dbfdf24ef36f7..fe69dc49062435 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll @@ -40,12 +40,20 @@ define amdgpu_cs i64 @constant_true() { ; Test ballot of a non-comparison operation define amdgpu_cs i64 @non_compare(i32 %x) { -; CHECK-LABEL: non_compare: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 -; CHECK-NEXT: ; return to shader part epilog +; DAGISEL-LABEL: non_compare: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; DAGISEL-NEXT: s_mov_b32 s1, 0 +; DAGISEL-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; DAGISEL-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: non_compare: +; GISEL: ; %bb.0: +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: s_mov_b32 s1, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GISEL-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GISEL-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc) ret i64 %ballot From 777142937a599d8a9cea5964b415d9cd13016d79 Mon Sep 17 00:00:00 2001 From: Simon Camphausen Date: Fri, 11 Oct 2024 11:45:25 +0200 Subject: [PATCH 146/177] [mlir][EmitC] Fail on memrefs with 0 dims in type conversion (#111965) This let's the type conversion fail instead of generating invalid array types. 
--- .../Conversion/MemRefToEmitC/MemRefToEmitC.cpp | 4 +++- .../MemRefToEmitC/memref-to-emitc-failed.mlir | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp index 2b7ac4b529cf0d..39532d34f616eb 100644 --- a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp +++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp @@ -167,7 +167,9 @@ void mlir::populateMemRefToEmitCTypeConversion(TypeConverter &typeConverter) { typeConverter.addConversion( [&](MemRefType memRefType) -> std::optional { if (!memRefType.hasStaticShape() || - !memRefType.getLayout().isIdentity() || memRefType.getRank() == 0) { + !memRefType.getLayout().isIdentity() || memRefType.getRank() == 0 || + llvm::any_of(memRefType.getShape(), + [](int64_t dim) { return dim == 0; })) { return {}; } Type convertedElementType = diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir index dee9cc97a14493..fda01974d3fc85 100644 --- a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir +++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir @@ -41,6 +41,22 @@ func.func @zero_rank() { // ----- +func.func @zero_dim_rank_1() { + // expected-error@+1 {{failed to legalize operation 'memref.alloca'}} + %0 = memref.alloca() : memref<0xf32> + return +} + +// ----- + +func.func @zero_dim_rank_3() { + // expected-error@+1 {{failed to legalize operation 'memref.alloca'}} + %0 = memref.alloca() : memref<2x0x4xf32> + return +} + +// ----- + // expected-error@+1 {{failed to legalize operation 'memref.global'}} memref.global "nested" constant @nested_global : memref<3x7xf32> From 80c15c48d1fbb53478c9400e598abcbdcae0d962 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 11:46:33 +0200 Subject: [PATCH 147/177] [clang][bytecode] Implement __builtin_assume_aligned (#111968) 
--- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 71 ++++++++++++++++++- clang/lib/AST/ExprConstShared.h | 8 +++ clang/lib/AST/ExprConstant.cpp | 35 +++++---- clang/test/Sema/builtin-assume-aligned.c | 2 + clang/test/SemaCXX/builtin-assume-aligned.cpp | 1 + 5 files changed, 98 insertions(+), 19 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 74e9e1cf629372..ec27aebf84bd80 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -38,7 +38,6 @@ static T getParam(const InterpFrame *Frame, unsigned Index) { return Frame->getParam(Offset); } -// static APSInt getAPSIntParam(InterpStack &Stk, size_t Offset = 0) { static APSInt getAPSIntParam(const InterpFrame *Frame, unsigned Index) { APSInt R; unsigned Offset = Frame->getFunction()->getParamOffset(Index); @@ -1162,6 +1161,71 @@ static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC, return false; } +/// __builtin_assume_aligned(Ptr, Alignment[, ExtraOffset]) +static bool interp__builtin_assume_aligned(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const Function *Func, + const CallExpr *Call) { + assert(Call->getNumArgs() == 2 || Call->getNumArgs() == 3); + + // Might be called with function pointers in C. 
+ std::optional PtrT = S.Ctx.classify(Call->getArg(0)); + if (PtrT != PT_Ptr) + return false; + + unsigned ArgSize = callArgSize(S, Call); + const Pointer &Ptr = S.Stk.peek(ArgSize); + std::optional ExtraOffset; + APSInt Alignment; + if (Call->getNumArgs() == 2) { + Alignment = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(1))); + } else { + PrimType AlignmentT = *S.Ctx.classify(Call->getArg(1)); + PrimType ExtraOffsetT = *S.Ctx.classify(Call->getArg(2)); + Alignment = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(1)), + align(primSize(AlignmentT)) + + align(primSize(ExtraOffsetT))); + ExtraOffset = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(2))); + } + + CharUnits Align = CharUnits::fromQuantity(Alignment.getZExtValue()); + + // If there is a base object, then it must have the correct alignment. + if (Ptr.isBlockPointer()) { + CharUnits BaseAlignment; + if (const auto *VD = Ptr.getDeclDesc()->asValueDecl()) + BaseAlignment = S.getASTContext().getDeclAlign(VD); + else if (const auto *E = Ptr.getDeclDesc()->asExpr()) + BaseAlignment = GetAlignOfExpr(S.getASTContext(), E, UETT_AlignOf); + + if (BaseAlignment < Align) { + S.CCEDiag(Call->getArg(0), + diag::note_constexpr_baa_insufficient_alignment) + << 0 << BaseAlignment.getQuantity() << Align.getQuantity(); + return false; + } + } + + APValue AV = Ptr.toAPValue(S.getASTContext()); + CharUnits AVOffset = AV.getLValueOffset(); + if (ExtraOffset) + AVOffset -= CharUnits::fromQuantity(ExtraOffset->getZExtValue()); + if (AVOffset.alignTo(Align) != AVOffset) { + if (Ptr.isBlockPointer()) + S.CCEDiag(Call->getArg(0), + diag::note_constexpr_baa_insufficient_alignment) + << 1 << AVOffset.getQuantity() << Align.getQuantity(); + else + S.CCEDiag(Call->getArg(0), + diag::note_constexpr_baa_value_insufficient_alignment) + << AVOffset.getQuantity() << Align.getQuantity(); + return false; + } + + S.Stk.push(Ptr); + return true; +} + static bool interp__builtin_ia32_bextr(InterpState &S, CodePtr OpPC, const InterpFrame 
*Frame, const Function *Func, @@ -1905,6 +1969,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; + case Builtin::BI__builtin_assume_aligned: + if (!interp__builtin_assume_aligned(S, OpPC, Frame, F, Call)) + return false; + break; + case clang::X86::BI__builtin_ia32_bextr_u32: case clang::X86::BI__builtin_ia32_bextr_u64: case clang::X86::BI__builtin_ia32_bextri_u32: diff --git a/clang/lib/AST/ExprConstShared.h b/clang/lib/AST/ExprConstShared.h index efe8ee986d29b3..401ae629c86bfd 100644 --- a/clang/lib/AST/ExprConstShared.h +++ b/clang/lib/AST/ExprConstShared.h @@ -14,12 +14,17 @@ #ifndef LLVM_CLANG_LIB_AST_EXPRCONSTSHARED_H #define LLVM_CLANG_LIB_AST_EXPRCONSTSHARED_H +#include "clang/Basic/TypeTraits.h" + namespace llvm { class APFloat; } namespace clang { class QualType; class LangOptions; +class ASTContext; +class CharUnits; +class Expr; } // namespace clang using namespace clang; /// Values returned by __builtin_classify_type, chosen to match the values @@ -66,4 +71,7 @@ void HandleComplexComplexDiv(llvm::APFloat A, llvm::APFloat B, llvm::APFloat C, llvm::APFloat D, llvm::APFloat &ResR, llvm::APFloat &ResI); +CharUnits GetAlignOfExpr(const ASTContext &Ctx, const Expr *E, + UnaryExprOrTypeTrait ExprKind); + #endif diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 06e653f96d6de1..70b223596d8b9b 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -9620,7 +9620,7 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr *E) { return ExprEvaluatorBaseTy::VisitCastExpr(E); } -static CharUnits GetAlignOfType(EvalInfo &Info, QualType T, +static CharUnits GetAlignOfType(const ASTContext &Ctx, QualType T, UnaryExprOrTypeTrait ExprKind) { // C++ [expr.alignof]p3: // When alignof is applied to a reference type, the result is the @@ -9631,23 +9631,22 @@ static CharUnits GetAlignOfType(EvalInfo &Info, QualType T, return CharUnits::One(); const bool 
AlignOfReturnsPreferred = - Info.Ctx.getLangOpts().getClangABICompat() <= LangOptions::ClangABI::Ver7; + Ctx.getLangOpts().getClangABICompat() <= LangOptions::ClangABI::Ver7; // __alignof is defined to return the preferred alignment. // Before 8, clang returned the preferred alignment for alignof and _Alignof // as well. if (ExprKind == UETT_PreferredAlignOf || AlignOfReturnsPreferred) - return Info.Ctx.toCharUnitsFromBits( - Info.Ctx.getPreferredTypeAlign(T.getTypePtr())); + return Ctx.toCharUnitsFromBits(Ctx.getPreferredTypeAlign(T.getTypePtr())); // alignof and _Alignof are defined to return the ABI alignment. else if (ExprKind == UETT_AlignOf) - return Info.Ctx.getTypeAlignInChars(T.getTypePtr()); + return Ctx.getTypeAlignInChars(T.getTypePtr()); else llvm_unreachable("GetAlignOfType on a non-alignment ExprKind"); } -static CharUnits GetAlignOfExpr(EvalInfo &Info, const Expr *E, - UnaryExprOrTypeTrait ExprKind) { +CharUnits GetAlignOfExpr(const ASTContext &Ctx, const Expr *E, + UnaryExprOrTypeTrait ExprKind) { E = E->IgnoreParens(); // The kinds of expressions that we have special-case logic here for @@ -9657,22 +9656,22 @@ static CharUnits GetAlignOfExpr(EvalInfo &Info, const Expr *E, // alignof decl is always accepted, even if it doesn't make sense: we default // to 1 in those cases. 
if (const DeclRefExpr *DRE = dyn_cast(E)) - return Info.Ctx.getDeclAlign(DRE->getDecl(), - /*RefAsPointee*/true); + return Ctx.getDeclAlign(DRE->getDecl(), + /*RefAsPointee*/ true); if (const MemberExpr *ME = dyn_cast(E)) - return Info.Ctx.getDeclAlign(ME->getMemberDecl(), - /*RefAsPointee*/true); + return Ctx.getDeclAlign(ME->getMemberDecl(), + /*RefAsPointee*/ true); - return GetAlignOfType(Info, E->getType(), ExprKind); + return GetAlignOfType(Ctx, E->getType(), ExprKind); } static CharUnits getBaseAlignment(EvalInfo &Info, const LValue &Value) { if (const auto *VD = Value.Base.dyn_cast()) return Info.Ctx.getDeclAlign(VD); if (const auto *E = Value.Base.dyn_cast()) - return GetAlignOfExpr(Info, E, UETT_AlignOf); - return GetAlignOfType(Info, Value.Base.getTypeInfoType(), UETT_AlignOf); + return GetAlignOfExpr(Info.Ctx, E, UETT_AlignOf); + return GetAlignOfType(Info.Ctx, Value.Base.getTypeInfoType(), UETT_AlignOf); } /// Evaluate the value of the alignment argument to __builtin_align_{up,down}, @@ -14475,11 +14474,11 @@ bool IntExprEvaluator::VisitUnaryExprOrTypeTraitExpr( case UETT_PreferredAlignOf: case UETT_AlignOf: { if (E->isArgumentType()) - return Success(GetAlignOfType(Info, E->getArgumentType(), E->getKind()), - E); + return Success( + GetAlignOfType(Info.Ctx, E->getArgumentType(), E->getKind()), E); else - return Success(GetAlignOfExpr(Info, E->getArgumentExpr(), E->getKind()), - E); + return Success( + GetAlignOfExpr(Info.Ctx, E->getArgumentExpr(), E->getKind()), E); } case UETT_PtrAuthTypeDiscriminator: { diff --git a/clang/test/Sema/builtin-assume-aligned.c b/clang/test/Sema/builtin-assume-aligned.c index c2e4f9d659dd4d..33e85578451529 100644 --- a/clang/test/Sema/builtin-assume-aligned.c +++ b/clang/test/Sema/builtin-assume-aligned.c @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -DSIZE_T_64 -fsyntax-only -Wno-strict-prototypes -triple x86_64-linux -verify %s // RUN: %clang_cc1 -fsyntax-only -Wno-strict-prototypes -triple i386-freebsd -verify %s +// RUN: 
%clang_cc1 -DSIZE_T_64 -fsyntax-only -Wno-strict-prototypes -triple x86_64-linux -verify %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -fsyntax-only -Wno-strict-prototypes -triple i386-freebsd -verify %s -fexperimental-new-constant-interpreter // __builtin_assume_aligned's second parameter is size_t, which may be 32 bits, // so test differently when size_t is 32 bits and when it is 64 bits. diff --git a/clang/test/SemaCXX/builtin-assume-aligned.cpp b/clang/test/SemaCXX/builtin-assume-aligned.cpp index 48bd8414fc50a1..85a7faee916181 100644 --- a/clang/test/SemaCXX/builtin-assume-aligned.cpp +++ b/clang/test/SemaCXX/builtin-assume-aligned.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -triple x86_64-linux-gnu %s +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -triple x86_64-linux-gnu %s -fexperimental-new-constant-interpreter int n; constexpr int *p = 0; From 73ad416ebf9d11b876f22ede0ee90f660192869f Mon Sep 17 00:00:00 2001 From: Dominik Adamski Date: Fri, 11 Oct 2024 11:53:28 +0200 Subject: [PATCH 148/177] [OpenMP][Flang] Enable alias analysis inside omp target region (#111670) At present, alias analysis does not work for operations inside OMP target regions because the FIR declare operations within OMP target do not offer sufficient information for alias analysis. Consequently, it is necessary to examine the FIR code outside the OMP target region. 
--- .../lib/Optimizer/Analysis/AliasAnalysis.cpp | 29 ++++++ flang/lib/Optimizer/Analysis/CMakeLists.txt | 2 + .../alias-analysis-omp-target-1.fir | 66 +++++++++++++ .../alias-analysis-omp-target-2.fir | 96 +++++++++++++++++++ 4 files changed, 193 insertions(+) create mode 100644 flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-1.fir create mode 100644 flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-2.fir diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index e88da5a8ebae19..6ee4f0ff71057a 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -13,6 +13,8 @@ #include "flang/Optimizer/Dialect/FortranVariableInterface.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "mlir/Analysis/AliasAnalysis.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/Dialect/OpenMP/OpenMPInterfaces.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Value.h" #include "mlir/Interfaces/SideEffectInterfaces.h" @@ -296,6 +298,17 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, defOp = v.getDefiningOp(); return; } + // If load is inside target and it points to mapped item, + // continue tracking. + Operation *loadMemrefOp = op.getMemref().getDefiningOp(); + bool isDeclareOp = llvm::isa(loadMemrefOp) || + llvm::isa(loadMemrefOp); + if (isDeclareOp && + llvm::isa(loadMemrefOp->getParentOp())) { + v = op.getMemref(); + defOp = v.getDefiningOp(); + return; + } // No further tracking for addresses loaded from memory for now. 
type = SourceKind::Indirect; breakFromLoop = true; @@ -319,6 +332,22 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, breakFromLoop = true; }) .Case([&](auto op) { + // If declare operation is inside omp target region, + // continue alias analysis outside the target region + if (auto targetOp = + llvm::dyn_cast(op->getParentOp())) { + auto argIface = cast(*targetOp); + for (auto [opArg, blockArg] : llvm::zip_equal( + targetOp.getMapVars(), argIface.getMapBlockArgs())) { + if (blockArg == op.getMemref()) { + omp::MapInfoOp mapInfo = + llvm::cast(opArg.getDefiningOp()); + v = mapInfo.getVarPtr(); + defOp = v.getDefiningOp(); + return; + } + } + } auto varIf = llvm::cast(defOp); // While going through a declare operation collect // the variable attributes from it. Right now, some diff --git a/flang/lib/Optimizer/Analysis/CMakeLists.txt b/flang/lib/Optimizer/Analysis/CMakeLists.txt index 436d4d3f18969c..c000a9da99f871 100644 --- a/flang/lib/Optimizer/Analysis/CMakeLists.txt +++ b/flang/lib/Optimizer/Analysis/CMakeLists.txt @@ -6,6 +6,7 @@ add_flang_library(FIRAnalysis FIRDialect HLFIRDialect MLIRIR + MLIROpenMPDialect LINK_LIBS FIRBuilder @@ -14,5 +15,6 @@ add_flang_library(FIRAnalysis MLIRFuncDialect MLIRLLVMDialect MLIRMathTransforms + MLIROpenMPDialect FIRSupport ) diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-1.fir b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-1.fir new file mode 100644 index 00000000000000..88f411847172a0 --- /dev/null +++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-1.fir @@ -0,0 +1,66 @@ +// Use --mlir-disable-threading so that the AA queries are serialized +// as well as its diagnostic output. 
+// RUN: fir-opt %s -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' -split-input-file --mlir-disable-threading 2>&1 | FileCheck %s + +// Fortran source code: +// +// program TestAllocatableArray +// real(kind=8), allocatable :: A(:) +// real(kind=8), allocatable :: B(:) +// !$omp target +// A(0) = B(0) +// !$omp end target +// end TestAllocatableArray + +// CHECK-LABEL: Testing : "_QPTestAllocatableArray" +// CHECK-DAG: targetArrayB#0 <-> targetArrayA#0: NoAlias +func.func @_QPTestAllocatableArray() { + %0 = fir.address_of(@_QFEa) : !fir.ref>>> + %1:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "ArrayA" } : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %2 = fir.address_of(@_QFEb) : !fir.ref>>> + %3:2 = hlfir.declare %2 {fortran_attrs = #fir.var_attrs, uniq_name = "ArrayB" } : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %4 = fir.load %1#0 : !fir.ref>>> + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %5 = fir.load %1#1 : !fir.ref>>> + %c0_0 = arith.constant 0 : index + %6:3 = fir.box_dims %5, %c0_0 : (!fir.box>>, index) -> (index, index, index) + %7:3 = fir.box_dims %4, %c0 : (!fir.box>>, index) -> (index, index, index) + %c0_1 = arith.constant 0 : index + %8 = arith.subi %7#1, %c1 : index + %9 = omp.map.bounds lower_bound(%c0_1 : index) upper_bound(%8 : index) extent(%7#1 : index) stride(%7#2 : index) start_idx(%6#0 : index) {stride_in_bytes = true} + %10 = fir.box_offset %1#1 base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> + %11 = omp.map.info var_ptr(%1#1 : !fir.ref>>>, !fir.array) var_ptr_ptr(%10 : !fir.llvm_ptr>>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%9) -> !fir.llvm_ptr>> {name = ""} + %12 = omp.map.info var_ptr(%1#1 : !fir.ref>>>, !fir.box>>) map_clauses(implicit, tofrom) capture(ByRef) members(%11 : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "a"} + %13 = fir.load %3#0 : !fir.ref>>> + %c1_2 = arith.constant 1 : index + %c0_3 = arith.constant 0 : index + %14 = fir.load %3#1 : 
!fir.ref>>> + %c0_4 = arith.constant 0 : index + %15:3 = fir.box_dims %14, %c0_4 : (!fir.box>>, index) -> (index, index, index) + %16:3 = fir.box_dims %13, %c0_3 : (!fir.box>>, index) -> (index, index, index) + %c0_5 = arith.constant 0 : index + %17 = arith.subi %16#1, %c1_2 : index + %18 = omp.map.bounds lower_bound(%c0_5 : index) upper_bound(%17 : index) extent(%16#1 : index) stride(%16#2 : index) start_idx(%15#0 : index) {stride_in_bytes = true} + %19 = fir.box_offset %3#1 base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> + %20 = omp.map.info var_ptr(%3#1 : !fir.ref>>>, !fir.array) var_ptr_ptr(%19 : !fir.llvm_ptr>>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%18) -> !fir.llvm_ptr>> {name = ""} + %21 = omp.map.info var_ptr(%3#1 : !fir.ref>>>, !fir.box>>) map_clauses(implicit, tofrom) capture(ByRef) members(%20 : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "b"} + omp.target map_entries(%11 -> %arg0, %12 -> %arg1, %20 -> %arg2, %21 -> %arg3 : !fir.llvm_ptr>>, !fir.ref>>>, !fir.llvm_ptr>>, !fir.ref>>>) { + %22:2 = hlfir.declare %arg1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %23:2 = hlfir.declare %arg3 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %24 = fir.load %23#0 : !fir.ref>>> + %c0_6 = arith.constant 0 : index + %25 = hlfir.designate %24 (%c0_6) {test.ptr = "targetArrayB"} : (!fir.box>>, index) -> !fir.ref + %26 = fir.load %25 : !fir.ref + %27 = fir.load %22#0 : !fir.ref>>> + %c0_7 = arith.constant 0 : index + %28 = hlfir.designate %27 (%c0_7) {test.ptr = "targetArrayA"} : (!fir.box>>, index) -> !fir.ref + hlfir.assign %26 to %28 : f64, !fir.ref + omp.terminator + } + return +} +fir.global internal @_QFEa : !fir.box>> { +} +fir.global internal @_QFEb : !fir.box>> { +} diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-2.fir b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-2.fir new file mode 100644 index 
00000000000000..c6b2e29a7188a9 --- /dev/null +++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-2.fir @@ -0,0 +1,96 @@ +// Use --mlir-disable-threading so that the AA queries are serialized +// as well as its diagnostic output. +// RUN: fir-opt %s -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' -split-input-file --mlir-disable-threading 2>&1 | FileCheck %s + +// Fortran source code: +// +// subroutine TestTargetData(p, a, b) +// real :: p(10), a(10), b(10) +// !$omp target data map(from: p) +// !$omp target map(to: a ) +// p(1) = a(1) +// !$omp end target +// !$omp target map(to: b ) +// p(1) = b(1) +// !$omp end target +// !$omp end target data +// end subroutine + +// CHECK-LABEL: Testing : "_QPTestTargetData" + +// CHECK-DAG: targetArrayA#0 <-> targetArrayP#0: NoAlias +// CHECK-DAG: targetArrayA#0 <-> targetArrayB#0: NoAlias +// CHECK-DAG: targetArrayP#0 <-> targetArrayB#0: NoAlias + +func.func @_QPTestTargetData(%arg0: !fir.ref> {fir.bindc_name = "p"}, %arg1: !fir.ref> {fir.bindc_name = "a"}, %arg2: !fir.ref> {fir.bindc_name = "b"}) { + %0 = fir.dummy_scope : !fir.dscope + %c10 = arith.constant 10 : index + %1 = fir.shape %c10 : (index) -> !fir.shape<1> + %2:2 = hlfir.declare %arg1(%1) dummy_scope %0 {uniq_name = "_QFtest_target_dataEa"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %c10_0 = arith.constant 10 : index + %3 = fir.shape %c10_0 : (index) -> !fir.shape<1> + %4:2 = hlfir.declare %arg2(%3) dummy_scope %0 {uniq_name = "_QFtest_target_dataEb"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %c10_1 = arith.constant 10 : index + %5 = fir.shape %c10_1 : (index) -> !fir.shape<1> + %6:2 = hlfir.declare %arg0(%5) dummy_scope %0 {uniq_name = "_QFtest_target_dataEp"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %7 = arith.subi %c10_1, %c1 : index + %8 = omp.map.bounds lower_bound(%c0 : index) 
upper_bound(%7 : index) extent(%c10_1 : index) stride(%c1 : index) start_idx(%c1 : index) + %9 = omp.map.info var_ptr(%6#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(from) capture(ByRef) bounds(%8) -> !fir.ref> {name = "p"} + omp.target_data map_entries(%9 : !fir.ref>) { + %c1_2 = arith.constant 1 : index + %c0_3 = arith.constant 0 : index + %10 = arith.subi %c10, %c1_2 : index + %11 = omp.map.bounds lower_bound(%c0_3 : index) upper_bound(%10 : index) extent(%c10 : index) stride(%c1_2 : index) start_idx(%c1_2 : index) + %12 = omp.map.info var_ptr(%2#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(to) capture(ByRef) bounds(%11) -> !fir.ref> {name = "a"} + %c1_4 = arith.constant 1 : index + %c0_5 = arith.constant 0 : index + %13 = arith.subi %c10_1, %c1_4 : index + %14 = omp.map.bounds lower_bound(%c0_5 : index) upper_bound(%13 : index) extent(%c10_1 : index) stride(%c1_4 : index) start_idx(%c1_4 : index) + %15 = omp.map.info var_ptr(%6#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%14) -> !fir.ref> {name = "p"} + omp.target map_entries(%12 -> %arg3, %15 -> %arg4 : !fir.ref>, !fir.ref>) { + %c10_10 = arith.constant 10 : index + %22 = fir.shape %c10_10 : (index) -> !fir.shape<1> + %23:2 = hlfir.declare %arg3(%22) {uniq_name = "_QFtest_target_dataEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c10_11 = arith.constant 10 : index + %24 = fir.shape %c10_11 : (index) -> !fir.shape<1> + %25:2 = hlfir.declare %arg4(%24) {uniq_name = "_QFtest_target_dataEp"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c1_12 = arith.constant 1 : index + %26 = hlfir.designate %23#0 (%c1_12) {test.ptr = "targetArrayA"} : (!fir.ref>, index) -> !fir.ref + %27 = fir.load %26 : !fir.ref + %c1_13 = arith.constant 1 : index + %28 = hlfir.designate %25#0 (%c1_13) {test.ptr = "targetArrayP"} : (!fir.ref>, index) -> !fir.ref + hlfir.assign %27 to %28 : f32, !fir.ref + omp.terminator + } + %c1_6 = arith.constant 1 : index + %c0_7 = 
arith.constant 0 : index + %16 = arith.subi %c10_0, %c1_6 : index + %17 = omp.map.bounds lower_bound(%c0_7 : index) upper_bound(%16 : index) extent(%c10_0 : index) stride(%c1_6 : index) start_idx(%c1_6 : index) + %18 = omp.map.info var_ptr(%4#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(to) capture(ByRef) bounds(%17) -> !fir.ref> {name = "b"} + %c1_8 = arith.constant 1 : index + %c0_9 = arith.constant 0 : index + %19 = arith.subi %c10_1, %c1_8 : index + %20 = omp.map.bounds lower_bound(%c0_9 : index) upper_bound(%19 : index) extent(%c10_1 : index) stride(%c1_8 : index) start_idx(%c1_8 : index) + %21 = omp.map.info var_ptr(%6#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%20) -> !fir.ref> {name = "p"} + omp.target map_entries(%18 -> %arg3, %21 -> %arg4 : !fir.ref>, !fir.ref>) { + %c10_10 = arith.constant 10 : index + %22 = fir.shape %c10_10 : (index) -> !fir.shape<1> + %23:2 = hlfir.declare %arg3(%22) {uniq_name = "_QFtest_target_dataEb"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c10_11 = arith.constant 10 : index + %24 = fir.shape %c10_11 : (index) -> !fir.shape<1> + %25:2 = hlfir.declare %arg4(%24) {uniq_name = "_QFtest_target_dataEp"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c1_12 = arith.constant 1 : index + %26 = hlfir.designate %23#0 (%c1_12) {test.ptr = "targetArrayB"} : (!fir.ref>, index) -> !fir.ref + %27 = fir.load %26 : !fir.ref + %c1_13 = arith.constant 1 : index + %28 = hlfir.designate %25#0 (%c1_13) {test.ptr = "targetArrayP"} : (!fir.ref>, index) -> !fir.ref + hlfir.assign %27 to %28 : f32, !fir.ref + omp.terminator + } + omp.terminator + } + return +} + From f74f568b29885c3fa63c44e33f91f3bb7281138e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Fri, 11 Oct 2024 11:58:14 +0200 Subject: [PATCH 149/177] [clang][analyzer] PointerSubChecker should not warn on pointers converted to numerical type (#111846) Pointer values casted to integer (non-pointer) 
type should be able to be subtracted as usual. --- .../StaticAnalyzer/Checkers/PointerSubChecker.cpp | 4 ++++ clang/test/Analysis/pointer-sub.c | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp index f0dc5efd75f7d6..7a85d9e2073068 100644 --- a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp @@ -61,6 +61,10 @@ void PointerSubChecker::checkPreStmt(const BinaryOperator *B, if (LR->getSymbolicBase() || RR->getSymbolicBase()) return; + if (!B->getLHS()->getType()->isPointerType() || + !B->getRHS()->getType()->isPointerType()) + return; + const auto *ElemLR = dyn_cast(LR); const auto *ElemRR = dyn_cast(RR); diff --git a/clang/test/Analysis/pointer-sub.c b/clang/test/Analysis/pointer-sub.c index 1c9d676ebb8f24..25fb7f043d468c 100644 --- a/clang/test/Analysis/pointer-sub.c +++ b/clang/test/Analysis/pointer-sub.c @@ -1,5 +1,7 @@ // RUN: %clang_analyze_cc1 -analyzer-checker=security.PointerSub -analyzer-output=text-minimal -verify %s +typedef int * Ptr; + void f1(void) { int x, y, z[10]; int d = &y - &x; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}} @@ -10,6 +12,12 @@ void f1(void) { d = &x - (&x + 1); // no-warning d = (&x + 0) - &x; // no-warning d = (z + 10) - z; // no-warning + d = (long long)&y - (long long)&x; // no-warning + long long l = 1; + d = l - (long long)&y; // no-warning + Ptr p1 = &x; + Ptr p2 = &y; + d = p1 - p2; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}} } void f2(void) { @@ -28,6 +36,10 @@ void f2(void) { d = (int *)((char *)(&a[4]) + sizeof(int)) - &a[4]; // no-warning (pointers into the same array data) d = (int *)((char *)(&a[4]) + 1) - &a[4]; // expected-warning{{Subtraction of two pointers that}} + + long long a1 = (long 
long)&a[1]; + long long b1 = (long long)&b[1]; + d = a1 - b1; } void f3(void) { From 6a65e98fa7901dc1de91172d065fafb16ce89d77 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 11 Oct 2024 18:19:21 +0800 Subject: [PATCH 150/177] [InstCombine] Drop range attributes in `foldIsPowerOf2` (#111946) Fixes https://github.com/llvm/llvm-project/issues/111934. --- .../InstCombine/InstCombineAndOrXor.cpp | 18 ++++++++--- llvm/test/Transforms/InstCombine/ispow2.ll | 32 +++++++++++++++++++ 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 688601a8ffa543..964616a4eb35e2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -955,9 +955,11 @@ static Value *foldIsPowerOf2OrZero(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd, } /// Reduce a pair of compares that check if a value has exactly 1 bit set. -/// Also used for logical and/or, must be poison safe. +/// Also used for logical and/or, must be poison safe if range attributes are +/// dropped. static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, - InstCombiner::BuilderTy &Builder) { + InstCombiner::BuilderTy &Builder, + InstCombinerImpl &IC) { // Handle 'and' / 'or' commutation: make the equality check the first operand. if (JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_NE) std::swap(Cmp0, Cmp1); @@ -971,7 +973,10 @@ static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, match(Cmp1, m_SpecificICmp(ICmpInst::ICMP_ULT, m_Intrinsic(m_Specific(X)), m_SpecificInt(2)))) { - Value *CtPop = Cmp1->getOperand(0); + auto *CtPop = cast(Cmp1->getOperand(0)); + // Drop range attributes and re-infer them in the next iteration. 
+ CtPop->dropPoisonGeneratingAnnotations(); + IC.addToWorklist(CtPop); return Builder.CreateICmpEQ(CtPop, ConstantInt::get(CtPop->getType(), 1)); } // (X == 0) || (ctpop(X) u> 1) --> ctpop(X) != 1 @@ -980,7 +985,10 @@ static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, match(Cmp1, m_SpecificICmp(ICmpInst::ICMP_UGT, m_Intrinsic(m_Specific(X)), m_SpecificInt(1)))) { - Value *CtPop = Cmp1->getOperand(0); + auto *CtPop = cast(Cmp1->getOperand(0)); + // Drop range attributes and re-infer them in the next iteration. + CtPop->dropPoisonGeneratingAnnotations(); + IC.addToWorklist(CtPop); return Builder.CreateICmpNE(CtPop, ConstantInt::get(CtPop->getType(), 1)); } return nullptr; @@ -3375,7 +3383,7 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, if (Value *V = foldSignedTruncationCheck(LHS, RHS, I, Builder)) return V; - if (Value *V = foldIsPowerOf2(LHS, RHS, IsAnd, Builder)) + if (Value *V = foldIsPowerOf2(LHS, RHS, IsAnd, Builder, *this)) return V; if (Value *V = foldPowerOf2AndShiftedMask(LHS, RHS, IsAnd, Builder)) diff --git a/llvm/test/Transforms/InstCombine/ispow2.ll b/llvm/test/Transforms/InstCombine/ispow2.ll index c21ad95f83a1c4..832c066370b0f8 100644 --- a/llvm/test/Transforms/InstCombine/ispow2.ll +++ b/llvm/test/Transforms/InstCombine/ispow2.ll @@ -1522,3 +1522,35 @@ define <2 x i1> @not_pow2_or_z_known_bits_fail_wrong_cmp(<2 x i32> %xin) { %r = icmp ugt <2 x i32> %cnt, ret <2 x i1> %r } + +; Make sure that range attributes on return values are dropped after merging these two icmps + +define i1 @has_single_bit(i32 %x) { +; CHECK-LABEL: @has_single_bit( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[POPCNT:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = icmp eq i32 [[POPCNT]], 1 +; CHECK-NEXT: ret i1 [[SEL]] +; +entry: + %cmp1 = icmp ne i32 %x, 0 + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp2 = icmp ult i32 %popcnt, 2 + %sel = select i1 %cmp1, i1 
%cmp2, i1 false + ret i1 %sel +} + +define i1 @has_single_bit_inv(i32 %x) { +; CHECK-LABEL: @has_single_bit_inv( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[POPCNT:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = icmp ne i32 [[POPCNT]], 1 +; CHECK-NEXT: ret i1 [[SEL]] +; +entry: + %cmp1 = icmp eq i32 %x, 0 + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp2 = icmp ugt i32 %popcnt, 1 + %sel = select i1 %cmp1, i1 true, i1 %cmp2 + ret i1 %sel +} From 65da32c634a8345fcbe021f69fc6a609d074c08c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 11 Oct 2024 11:26:57 +0100 Subject: [PATCH 151/177] [LV] Account for any-of reduction when computing costs of blend phis. Any-of reductions are narrowed to i1. Update the legacy cost model to use the correct type when computing the cost of a phi that gets lowered to selects (BLEND). This fixes a divergence between legacy and VPlan-based cost models after 36fc291b6ec6d. Fixes https://github.com/llvm/llvm-project/issues/111874. --- .../Transforms/Vectorize/LoopVectorize.cpp | 24 ++- .../RISCV/blend-any-of-reduction-cost.ll | 167 ++++++++++++++++++ 2 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 05dc58a42249ca..54f57fb0b6b58e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6480,12 +6480,32 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi // node, where N is the number of incoming values. 
- if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) + if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) { + Type *ResultTy = Phi->getType(); + + // All instructions in an Any-of reduction chain are narrowed to bool. + // Check if that is the case for this phi node. + auto *HeaderUser = cast_if_present( + find_singleton(Phi->users(), [this](User *U, bool) -> User * { + auto *Phi = dyn_cast(U); + if (Phi && Phi->getParent() == TheLoop->getHeader()) + return Phi; + return nullptr; + })); + if (HeaderUser) { + auto &ReductionVars = Legal->getReductionVars(); + auto Iter = ReductionVars.find(HeaderUser); + if (Iter != ReductionVars.end() && + RecurrenceDescriptor::isAnyOfRecurrenceKind( + Iter->second.getRecurrenceKind())) + ResultTy = Type::getInt1Ty(Phi->getContext()); + } return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( - Instruction::Select, ToVectorTy(Phi->getType(), VF), + Instruction::Select, ToVectorTy(ResultTy, VF), ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), CmpInst::BAD_ICMP_PREDICATE, CostKind); + } return TTI.getCFInstrCost(Instruction::PHI, CostKind); } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll new file mode 100644 index 00000000000000..7db47cb9171d24 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll @@ -0,0 +1,167 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +; Test case for https://github.com/llvm/llvm-project/issues/111874. 
+define i32 @any_of_reduction_used_in_blend(ptr %src, i64 %N, i1 %c.0, i1 %c.1) #0 { +; CHECK-LABEL: define i32 @any_of_reduction_used_in_blend( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i1 [[C_0:%.*]], i1 [[C_1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[ANY_OF_RED:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ANY_OF_RED_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: br i1 [[C_0]], label %[[LOOP_LATCH]], label %[[ELSE_1:.*]] +; CHECK: [[ELSE_1]]: +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[ELSE_2:.*]] +; CHECK: [[ELSE_2]]: +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[SRC]], align 8 +; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[L]], null +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C_2]], i32 0, i32 [[ANY_OF_RED]] +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ANY_OF_RED_NEXT]] = phi i32 [ [[ANY_OF_RED]], %[[LOOP_HEADER]] ], [ [[ANY_OF_RED]], %[[ELSE_1]] ], [ [[SEL]], %[[ELSE_2]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[ANY_OF_RED_NEXT]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop.header + +loop.header: + %any.of.red = phi i32 [ 0, %entry ], [ %any.of.red.next, %loop.latch ] + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c.0, label %loop.latch, label %else.1 + +else.1: + br i1 %c.1, label %loop.latch, label %else.2 + +else.2: + %l = load ptr, ptr %src, align 8 + %c.2 = icmp eq ptr %l, null + %sel = select i1 %c.2, i32 0, i32 %any.of.red + br label %loop.latch + +loop.latch: + %any.of.red.next = phi i32 [ %any.of.red, %loop.header ], [ %any.of.red, %else.1 ], [ %sel, %else.2 ] + 
%iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec, label %exit, label %loop.header + +exit: + %res = phi i32 [ %any.of.red.next, %loop.latch ] + ret i32 %res +} + +define i32 @any_of_reduction_used_in_blend_with_mutliple_phis(ptr %src, i64 %N, i1 %c.0, i1 %c.1) #0 { +; CHECK-LABEL: define i32 @any_of_reduction_used_in_blend_with_mutliple_phis( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i1 [[C_0:%.*]], i1 [[C_1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i1 [[C_0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i1 [[C_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, ptr [[SRC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-NEXT: [[TMP6:%.*]] = xor [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP7:%.*]] = xor [[BROADCAST_SPLAT2]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP6]], [[TMP7]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2p0.nxv2p0( [[BROADCAST_SPLAT4]], i32 8, [[TMP8]], poison) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq [[WIDE_MASKED_GATHER]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or [[VEC_PHI]], [[TMP9]] +; CHECK-NEXT: [[PREDPHI]] = select [[TMP8]], [[TMP10]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1( [[PREDPHI]]) +; CHECK-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]] +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 0, i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[ANY_OF_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ANY_OF_RED_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: br i1 [[C_0]], label %[[X_1:.*]], label %[[ELSE_1:.*]] +; CHECK: [[ELSE_1]]: +; CHECK-NEXT: br i1 [[C_1]], label %[[X_1]], label %[[ELSE_2:.*]] +; CHECK: 
[[ELSE_2]]: +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[SRC]], align 8 +; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[L]], null +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C_2]], i32 0, i32 [[ANY_OF_RED]] +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[X_1]]: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[ANY_OF_RED]], %[[LOOP_HEADER]] ], [ [[ANY_OF_RED]], %[[ELSE_1]] ] +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ANY_OF_RED_NEXT]] = phi i32 [ [[P]], %[[X_1]] ], [ [[SEL]], %[[ELSE_2]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[ANY_OF_RED_NEXT]], %[[LOOP_LATCH]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop.header + +loop.header: + %any.of.red = phi i32 [ 0, %entry ], [ %any.of.red.next, %loop.latch ] + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c.0, label %x.1, label %else.1 + +else.1: + br i1 %c.1, label %x.1, label %else.2 + +else.2: + %l = load ptr, ptr %src, align 8 + %c.2 = icmp eq ptr %l, null + %sel = select i1 %c.2, i32 0, i32 %any.of.red + br label %loop.latch + +x.1: + %p = phi i32 [ %any.of.red, %loop.header ], [ %any.of.red, %else.1 ] + br label %loop.latch + +loop.latch: + %any.of.red.next = phi i32 [ %p, %x.1 ], [ %sel, %else.2 ] + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec, label %exit, label %loop.header + +exit: + %res = phi i32 [ %any.of.red.next, %loop.latch ] + ret i32 %res +} + +attributes #0 = { "target-cpu"="sifive-p670" } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. 
From d941254da94c8a5897689a74012a57de279c2c9e Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 11 Oct 2024 11:00:07 +0000 Subject: [PATCH 152/177] [lldb][test] Fix var name typo in TestProcessSaveCoreMinidump --- .../TestProcessSaveCoreMinidump.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py index 4818dde4f3b838..808de687e6ea2e 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py @@ -117,14 +117,14 @@ def test_save_linux_mini_dump(self): expected_number_of_threads = process.GetNumThreads() expected_threads = [] stacks_to_sp_map = {} - stakcs_to_registers_map = {} + stacks_to_registers_map = {} for thread_idx in range(process.GetNumThreads()): thread = process.GetThreadAtIndex(thread_idx) thread_id = thread.GetThreadID() expected_threads.append(thread_id) stacks_to_sp_map[thread_id] = thread.GetFrameAtIndex(0).GetSP() - stakcs_to_registers_map[thread_id] = thread.GetFrameAtIndex( + stacks_to_registers_map[thread_id] = thread.GetFrameAtIndex( 0 ).GetRegisters() @@ -138,7 +138,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) self.runCmd(base_command + " --style=modified-memory '%s'" % (core_dirty)) @@ -149,7 +149,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) self.runCmd(base_command + " --style=full '%s'" % (core_full)) @@ -160,7 +160,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) options = 
lldb.SBSaveCoreOptions() @@ -178,7 +178,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) options = lldb.SBSaveCoreOptions() @@ -195,7 +195,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) # Minidump can now save full core files, but they will be huge and @@ -214,7 +214,7 @@ def test_save_linux_mini_dump(self): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) self.assertSuccess(process.Kill()) From 4451f9f812d458f6b53785b27869674caf01e67b Mon Sep 17 00:00:00 2001 From: Sebastian Kreutzer Date: Fri, 11 Oct 2024 07:11:03 -0400 Subject: [PATCH 153/177] [XRay] Fix LLVM include in xray_interface.cpp (#111978) Removes a dependency on LLVM in `xray_interface.cpp` by replacing `llvm_unreachable` with compiler-rt's `UNREACHABLE`. Applies clang-format to some unformatted changes. 
Original PR: #90959 --- clang/include/clang/Driver/XRayArgs.h | 4 +-- clang/lib/Driver/XRayArgs.cpp | 8 ++--- compiler-rt/include/xray/xray_interface.h | 40 +++++++++++++---------- compiler-rt/lib/xray/xray_interface.cpp | 5 ++- 4 files changed, 29 insertions(+), 28 deletions(-) diff --git a/clang/include/clang/Driver/XRayArgs.h b/clang/include/clang/Driver/XRayArgs.h index 8fbcf469e5bad1..1b5c4a4c42f12a 100644 --- a/clang/include/clang/Driver/XRayArgs.h +++ b/clang/include/clang/Driver/XRayArgs.h @@ -36,9 +36,7 @@ class XRayArgs { llvm::opt::ArgStringList &CmdArgs, types::ID InputType) const; bool needsXRayRt() const { return XRayInstrument && XRayRT; } - bool needsXRayDSORt() const { - return XRayInstrument && XRayRT && XRayShared; - } + bool needsXRayDSORt() const { return XRayInstrument && XRayRT && XRayShared; } llvm::ArrayRef modeList() const { return Modes; } XRayInstrSet instrumentationBundle() const { return InstrumentationBundle; } }; diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index 411054e067cb42..d0bb5d4887c184 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -63,8 +63,8 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { << XRayInstrument->getSpelling() << Triple.str(); } - if (Args.hasFlag(options::OPT_fxray_shared, - options::OPT_fno_xray_shared, false)) { + if (Args.hasFlag(options::OPT_fxray_shared, options::OPT_fno_xray_shared, + false)) { XRayShared = true; // DSO instrumentation is currently limited to x86_64 @@ -75,8 +75,8 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { unsigned PICLvl = std::get<1>(tools::ParsePICArgs(TC, Args)); if (!PICLvl) { - D.Diag(diag::err_opt_not_valid_without_opt) - << "-fxray-shared" << "-fPIC"; + D.Diag(diag::err_opt_not_valid_without_opt) << "-fxray-shared" + << "-fPIC"; } } diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h index 717cfe292ce416..675ea0cbc48c83 100644 --- 
a/compiler-rt/include/xray/xray_interface.h +++ b/compiler-rt/include/xray/xray_interface.h @@ -93,8 +93,8 @@ enum XRayPatchingStatus { FAILED = 3, }; -/// This tells XRay to patch the instrumentation points in all currently loaded objects. See XRayPatchingStatus -/// for possible result values. +/// This tells XRay to patch the instrumentation points in all currently loaded +/// objects. See XRayPatchingStatus for possible result values. extern XRayPatchingStatus __xray_patch(); /// This tells XRay to patch the instrumentation points in the given object. @@ -105,8 +105,8 @@ extern XRayPatchingStatus __xray_patch_object(int32_t ObjId); /// result values. extern XRayPatchingStatus __xray_unpatch(); -/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for possible -/// result values. +/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for +/// possible result values. extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); /// This unpacks the given (packed) function id and patches @@ -114,8 +114,8 @@ extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); /// result values. extern XRayPatchingStatus __xray_patch_function(int32_t FuncId); -/// This patches a specific function in the given object. See XRayPatchingStatus for possible -/// result values. +/// This patches a specific function in the given object. See XRayPatchingStatus +/// for possible result values. extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId, int32_t ObjId); @@ -129,26 +129,29 @@ extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId); extern XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId, int32_t ObjId); -/// This function unpacks the given (packed) function id and returns the address of the corresponding function. We return 0 if we encounter any error, even if 0 may be a valid function -/// address. 
+/// This function unpacks the given (packed) function id and returns the address +/// of the corresponding function. We return 0 if we encounter any error, even +/// if 0 may be a valid function address. extern uintptr_t __xray_function_address(int32_t FuncId); -/// This function returns the address of the function in the given object provided valid function and object -/// ids. We return 0 if we encounter any error, even if 0 may be a valid function -/// address. +/// This function returns the address of the function in the given object +/// provided valid function and object ids. We return 0 if we encounter any +/// error, even if 0 may be a valid function address. extern uintptr_t __xray_function_address_in_object(int32_t FuncId, int32_t ObjId); -/// This function returns the maximum valid function id for the main executable (object id = 0). Returns 0 if we -/// encounter errors (when there are no instrumented functions, etc.). +/// This function returns the maximum valid function id for the main executable +/// (object id = 0). Returns 0 if we encounter errors (when there are no +/// instrumented functions, etc.). extern size_t __xray_max_function_id(); -/// This function returns the maximum valid function id for the given object. Returns 0 if we -/// encounter errors (when there are no instrumented functions, etc.). +/// This function returns the maximum valid function id for the given object. +/// Returns 0 if we encounter errors (when there are no instrumented functions, +/// etc.). extern size_t __xray_max_function_id_in_object(int32_t ObjId); -/// This function returns the number of previously registered objects (executable + loaded DSOs). -/// Returns 0 if XRay has not been initialized. +/// This function returns the number of previously registered objects +/// (executable + loaded DSOs). Returns 0 if XRay has not been initialized. extern size_t __xray_num_objects(); /// Unpacks the function id from the given packed id. 
@@ -158,7 +161,8 @@ extern int32_t __xray_unpack_function_id(int32_t PackedId); extern int32_t __xray_unpack_object_id(int32_t PackedId); /// Creates and returns a packed id from the given function and object ids. -/// If the ids do not fit within the reserved number of bits for each part, the high bits are truncated. +/// If the ids do not fit within the reserved number of bits for each part, the +/// high bits are truncated. extern int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId); /// Initialize the required XRay data structures. This is useful in cases where diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 16e60bfc22cd10..402fc3d07b4e2a 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "xray_interface_internal.h" -#include "llvm/Support/ErrorHandling.h" #include #include @@ -411,9 +410,9 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { CombinedStatus = NOT_INITIALIZED; break; case ONGOING: - llvm_unreachable("Status ONGOING should not appear at this point"); + UNREACHABLE("Status ONGOING should not appear at this point"); default: - llvm_unreachable("Unhandled patching status"); + UNREACHABLE("Unhandled patching status"); } } return CombinedStatus; From 0163ac1f53abc0a0f6e5b7e56912c1dee67e7f32 Mon Sep 17 00:00:00 2001 From: Mats Petersson Date: Fri, 11 Oct 2024 12:23:37 +0100 Subject: [PATCH 154/177] [Flang][OpenMP]Add tests for TODOs and small changes to improve messages (#111562) The bulk of this change is new tests to check that we get a "Not yet implemented: *some stuff here*" message when using some not yet supported OpenMP functionality. 
For some of these cases, this also means adding additional clauses to a filter list in OpenMP.cpp - this changes nothing [to the best of my understanding] other than allowing the clause to get to the point where it can be rejected in a TODO with a more clear message. One of the TODO filters was missing Mergeable clause, so this was also added and the existing test updated for the new more specific error message. There is no functional change intended here. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 9 ++++++--- flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 | 14 ++++++++++++++ flang/test/Lower/OpenMP/Todo/reduction-task.f90 | 12 ++++++++++++ .../test/Lower/OpenMP/Todo/target-inreduction.f90 | 15 +++++++++++++++ flang/test/Lower/OpenMP/Todo/task-inreduction.f90 | 15 +++++++++++++++ flang/test/Lower/OpenMP/Todo/task_mergeable.f90 | 2 +- .../OpenMP/Todo/taskgroup-task-reduction.f90 | 10 ++++++++++ flang/test/Lower/OpenMP/Todo/taskloop.f90 | 13 +++++++++++++ flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 | 10 ++++++++++ flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 | 8 ++++++++ 10 files changed, 104 insertions(+), 4 deletions(-) create mode 100644 flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/reduction-task.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/target-inreduction.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/task-inreduction.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/taskloop.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 create mode 100644 flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 0735e40ea2ca7e..a89029b720e788 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1310,8 +1310,8 @@ static void genTaskClauses(lower::AbstractConverter &converter, 
cp.processUntied(clauseOps); // TODO Support delayed privatization. - cp.processTODO( - loc, llvm::omp::Directive::OMPD_task); + cp.processTODO(loc, llvm::omp::Directive::OMPD_task); } static void genTaskgroupClauses(lower::AbstractConverter &converter, @@ -2780,7 +2780,10 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, !std::holds_alternative(clause.u) && !std::holds_alternative(clause.u) && !std::holds_alternative(clause.u) && - !std::holds_alternative(clause.u)) { + !std::holds_alternative(clause.u) && + !std::holds_alternative(clause.u) && + !std::holds_alternative(clause.u) && + !std::holds_alternative(clause.u)) { TODO(clauseLocation, "OpenMP Block construct clause"); } } diff --git a/flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 b/flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 new file mode 100644 index 00000000000000..c5f196fe09693a --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 @@ -0,0 +1,14 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Reduction modifiers are not supported +subroutine reduction_inscan() + integer :: i,j + i = 0 + + !$omp do reduction(inscan, +:i) + do j=1,10 + i = i + 1 + end do + !$omp end do +end subroutine reduction_inscan diff --git a/flang/test/Lower/OpenMP/Todo/reduction-task.f90 b/flang/test/Lower/OpenMP/Todo/reduction-task.f90 new file mode 100644 index 00000000000000..6707f65e1a4cc3 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/reduction-task.f90 @@ -0,0 +1,12 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +! 
CHECK: not yet implemented: Reduction modifiers are not supported +subroutine reduction_task() + integer :: i + i = 0 + + !$omp parallel reduction(task, +:i) + i = i + 1 + !$omp end parallel +end subroutine reduction_task diff --git a/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 new file mode 100644 index 00000000000000..e5a9cffac5a117 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 @@ -0,0 +1,15 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +!=============================================================================== +! `mergeable` clause +!=============================================================================== + +! CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TARGET construct +subroutine omp_target_inreduction() + integer i + i = 0 + !$omp target in_reduction(+:i) + i = i + 1 + !$omp end target +end subroutine omp_target_inreduction diff --git a/flang/test/Lower/OpenMP/Todo/task-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/task-inreduction.f90 new file mode 100644 index 00000000000000..aeed680a6dba7c --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/task-inreduction.f90 @@ -0,0 +1,15 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +!=============================================================================== +! `mergeable` clause +!=============================================================================== + +! 
CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TASK construct +subroutine omp_task_in_reduction() + integer i + i = 0 + !$omp task in_reduction(+:i) + i = i + 1 + !$omp end task +end subroutine omp_task_in_reduction diff --git a/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 b/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 index 13145d92ccf902..ddc27487abfe9c 100644 --- a/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 +++ b/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 @@ -5,7 +5,7 @@ ! `mergeable` clause !=============================================================================== -! CHECK: not yet implemented: OpenMP Block construct clause +! CHECK: not yet implemented: Unhandled clause MERGEABLE in TASK construct subroutine omp_task_mergeable() !$omp task mergeable call foo() diff --git a/flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90 b/flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90 new file mode 100644 index 00000000000000..1cb471d784d766 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90 @@ -0,0 +1,10 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause TASK_REDUCTION in TASKGROUP construct +subroutine omp_taskgroup_task_reduction + integer :: res + !$omp taskgroup task_reduction(+:res) + res = res + 1 + !$omp end taskgroup +end subroutine omp_taskgroup_task_reduction diff --git a/flang/test/Lower/OpenMP/Todo/taskloop.f90 b/flang/test/Lower/OpenMP/Todo/taskloop.f90 new file mode 100644 index 00000000000000..aca050584cbbe3 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskloop.f90 @@ -0,0 +1,13 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s + +! 
CHECK: not yet implemented: Taskloop construct +subroutine omp_taskloop + integer :: res, i + !$omp taskloop + do i = 1, 10 + res = res + 1 + end do + !$omp end taskloop +end subroutine omp_taskloop + diff --git a/flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 b/flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 new file mode 100644 index 00000000000000..d1f953be8802fa --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 @@ -0,0 +1,10 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause DEPEND in TASKWAIT construct +subroutine omp_tw_depend + integer :: res + !$omp taskwait depend(out: res) + res = res + 1 +end subroutine omp_tw_depend + diff --git a/flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 b/flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 new file mode 100644 index 00000000000000..21e8609b08ba37 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 @@ -0,0 +1,8 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=51 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=51 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause NOWAIT in TASKWAIT construct +subroutine omp_tw_nowait + !$omp taskwait nowait +end subroutine omp_tw_nowait + From b5ea5be2a714e28bac57d417c221f687efe396bf Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Fri, 11 Oct 2024 13:24:54 +0200 Subject: [PATCH 155/177] [RISCV][MC] Fix >32bit .insn Directives (#111878) The original patch had a reasonably significant bug. You could not use `.insn` to assemble encodings that had any bits set above the low 32 bits. This is due to the fact that `getMachineOpValue` was truncating the immediate value, and I did not commit enough tests of useful cases. 
This changes the result of `getMachineOpValue` to be able to return the 48-bit and 64-bit immediates needed for the wider `.insn` directives. I took the opportunity to move some of the test cases around in the file to make looking at the output of `llvm-objdump` a little clearer. --- .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 6 ++-- llvm/test/MC/RISCV/insn.s | 35 +++++++++++++++---- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 66970ed37f2724..54f1a3899c4957 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -77,7 +77,7 @@ class RISCVMCCodeEmitter : public MCCodeEmitter { /// Return binary encoding of operand. If the machine operand requires /// relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; @@ -375,7 +375,7 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, ++MCNumEmitted; // Keep track of the # of mi's emitted. 
} -unsigned +uint64_t RISCVMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { @@ -384,7 +384,7 @@ RISCVMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); if (MO.isImm()) - return static_cast(MO.getImm()); + return MO.getImm(); llvm_unreachable("Unhandled expression!"); return 0; diff --git a/llvm/test/MC/RISCV/insn.s b/llvm/test/MC/RISCV/insn.s index e32fec25bb16b4..d24f4fe8b36374 100644 --- a/llvm/test/MC/RISCV/insn.s +++ b/llvm/test/MC/RISCV/insn.s @@ -170,17 +170,40 @@ target: # CHECK-OBJ: .insn 6, 0x1f -# CHECK-ASM: .insn 0x4, 65503 -# CHECK-ASM: encoding: [0xdf,0xff,0x00,0x00] -# CHECK-OBJ: -.insn 0xffdf - # CHECK-ASM: .insn 0x8, 63 # CHECK-ASM: encoding: [0x3f,0x00,0x00,0x00,0x00,0x00,0x00,0x00] # CHECK-OBJ: .insn 8, 0x3f +# CHECK-ASM: .insn 0x6, 281474976710623 +# CHECK-ASM: encoding: [0xdf,0xff,0xff,0xff,0xff,0xff] +# CHECK-OBJ: +.insn 0x6, 0xffffffffffdf + +# CHECK-ASM: .insn 0x8, -65 +# CHECK-ASM: encoding: [0xbf,0xff,0xff,0xff,0xff,0xff,0xff,0xff] +# CHECK-OBJ: +.insn 0x8, 0xffffffffffffffbf + +odd_lengths: +# CHECK-ASM-LABEL: odd_lengths: +# CHECK-OBJ-LABEL: : + +## These deliberately disagree with the lengths objdump expects them to have, so +## keep them at the end so that the disassembled instruction stream is not out +## of sync with the encoded instruction stream. We don't check for `` +## as we could get any number of those, so instead check for the encoding +## halfwords. These might be split into odd 16-bit chunks, so each chunk is on +## one line. 
+ +# CHECK-ASM: .insn 0x4, 65503 +# CHECK-ASM: encoding: [0xdf,0xff,0x00,0x00] +# CHECK-OBJ: ffdf +# CHECK-OBJ: 0000 +.insn 0xffdf + # CHECK-ASM: .insn 0x4, 65471 # CHECK-ASM: encoding: [0xbf,0xff,0x00,0x00] -# CHECK-OBJ: +# CHECK-OBJ: ffbf +# CHECK-OBJ: 0000 .insn 0xffbf From 303c8d20601d810c177f6646f771c1eb3f29ab8c Mon Sep 17 00:00:00 2001 From: Rin Dobrescu Date: Fri, 11 Oct 2024 12:29:44 +0100 Subject: [PATCH 156/177] [AArch64] Add SchedReadAdvance to Neoverse-V1 scheduling model. (#111538) Introduce a description of late forwarding to the Neoverse-V1 Scheduling model. --- .../Target/AArch64/AArch64SchedNeoverseV1.td | 207 ++- .../llvm-mca/AArch64/Neoverse/V1-forwarding.s | 1421 +++++++++++++++++ .../AArch64/Neoverse/V1-neon-instructions.s | 138 +- 3 files changed, 1645 insertions(+), 121 deletions(-) create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td index f7e6545f0dd386..fb4d2f3d7bcd3a 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td @@ -469,6 +469,89 @@ def V1Write_11c_9L01_9S_9V : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01, V1UnitV, V1UnitV, V1UnitV, V1UnitV, V1UnitV, V1UnitV]>; +//===----------------------------------------------------------------------===// +// Define forwarded types + +// NOTE: SOG, p. 20, n. 2: Accumulator forwarding is not supported for +// consumers of 64 bit multiply high operations? 
+def V1Wr_IM : SchedWriteRes<[V1UnitM]> { let Latency = 2; } +def V1Wr_IMA : SchedWriteRes<[V1UnitM0]> { let Latency = 2; } +def V1WriteIM : SchedWriteVariant< + [SchedVar, + SchedVar]>; +def V1Rd_IMA : SchedReadAdvance<1, [V1Wr_IMA]>; + +def V1Wr_FMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_FMA : SchedReadAdvance<2, [WriteFMul, V1Wr_FMA]>; + +def V1Wr_ADA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; } +def V1Rd_ADA : SchedReadAdvance<3, [V1Wr_ADA]>; + +def V1Wr_VDOT : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Rd_VDOT : SchedReadAdvance<2, [V1Wr_VDOT]>; + +def V1Wr_VMMA : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Rd_VMMA : SchedReadAdvance<2, [V1Wr_VMMA]>; + +def V1Wr_VMA : SchedWriteRes<[V1UnitV02]> { let Latency = 4; } +def V1Rd_VMA : SchedReadAdvance<3, [V1Wr_VMA]>; + +def V1Wr_VMAL : SchedWriteRes<[V1UnitV02]> { let Latency = 4; } +def V1Rd_VMAL : SchedReadAdvance<3, [V1Wr_VMAL]>; + +def V1Wr_VSA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; } +def V1Rd_VSA : SchedReadAdvance<3, [V1Wr_VSA]>; + +def V1Wr_FCMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_FCMA : SchedReadAdvance<2, [V1Wr_FCMA]>; + +def V1Wr_FPM : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Wr_FPMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_FPMA : SchedReadAdvance<2, [V1Wr_FPM, V1Wr_FPMA]>; + +def V1Wr_FPMAL : SchedWriteRes<[V1UnitV]> { let Latency = 5; } +def V1Rd_FPMAL : SchedReadAdvance<3, [V1Wr_FPMAL]>; + +def V1Wr_BFD : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_BFD : SchedReadAdvance<2, [V1Wr_BFD]>; + +def V1Wr_BFMMA : SchedWriteRes<[V1UnitV]> { let Latency = 5; } +def V1Rd_BFMMA : SchedReadAdvance<2, [V1Wr_BFMMA]>; + +def V1Wr_BFMLA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_BFMLA : SchedReadAdvance<2, [V1Wr_BFMLA]>; + +def V1Wr_CRC : SchedWriteRes<[V1UnitM0]> { let Latency = 2; } +def V1Rd_CRC : SchedReadAdvance<1, [V1Wr_CRC]>; + +def V1Wr_ZDOTB : SchedWriteRes<[V1UnitV01]> { let 
Latency = 3; } +def V1Rd_ZDOTB : SchedReadAdvance<2, [V1Wr_ZDOTB]>; + +def V1Wr_ZUDOTB : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Rd_ZUDOTB : SchedReadAdvance<2, [V1Wr_ZUDOTB]>; + +def V1Wr_ZDOTH : SchedWriteRes<[V1UnitV0]> { let Latency = 4; } +def V1Rd_ZDOTH : SchedReadAdvance<3, [V1Wr_ZDOTH]>; + +def V1Wr_ZMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 3; } +def V1Rd_ZMMA : SchedReadAdvance<2, [V1Wr_ZMMA]>; + +let Latency = 5, NumMicroOps = 2 in +def V1Wr_ZMAD : SchedWriteRes<[V1UnitV0, V1UnitV0]>; +def V1Rd_ZMAD : SchedReadAdvance<3, [V1Wr_ZMAD]>; + +def V1Wr_ZFCMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; } +def V1Rd_ZFCMA : SchedReadAdvance<3, [V1Wr_ZFCMA]>; + +def V1Wr_ZFMA : SchedWriteRes<[V1UnitV01]> { let Latency = 4; } +def V1Rd_ZFMA : SchedReadAdvance<2, [V1Wr_ZFMA]>; + +def V1Wr_ZBFDOT : SchedWriteRes<[V1UnitV01]> { let Latency = 4; } +def V1Rd_ZBFDOT : SchedReadAdvance<2, [V1Wr_ZBFDOT]>; +def V1Wr_ZBFMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; } +def V1Rd_ZBFMMA : SchedReadAdvance<2, [V1Wr_ZBFMMA]>; +def V1Wr_ZBFMAL : SchedWriteRes<[V1UnitV01]> { let Latency = 5; } +def V1Rd_ZBFMAL : SchedReadAdvance<3, [V1Wr_ZBFMAL]>; // Miscellaneous Instructions // ----------------------------------------------------------------------------- @@ -553,16 +636,19 @@ def : InstRW<[V1Write_1c_1J], (instrs SETF8, SETF16, RMIF, CFINV)>; def : SchedAlias; def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + // Multiply -// Multiply accumulate -// Multiply accumulate, long -// Multiply long -def V1WriteIM : SchedWriteVariant< - [SchedVar, - SchedVar]>; -def : SchedAlias; -def : SchedAlias; +// Multiply accumulate, W-form +// Multiply accumulate, X-form +def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA], + (instregex "^M(ADD|SUB)[WX]rrr$")>; +// Multiply accumulate long +// Multiply long +def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA], + (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; // Multiply high def : InstRW<[V1Write_3c_1M, ReadIM, ReadIM], 
(instrs SMULHrr, UMULHrr)>; @@ -680,10 +766,11 @@ def : InstRW<[V1Write_15c7_1V02], (instrs FDIVDrr)>; def : InstRW<[V1Write_16c7_1V02], (instrs FSQRTDr)>; // FP multiply -def : SchedAlias; +def : WriteRes { let Latency = 3; } // FP multiply accumulate -def : InstRW<[V1Write_4c_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>; +def : InstRW<[V1Wr_FMA, ReadDefault, ReadDefault, V1Rd_FMA], + (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>; // FP round to integral def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$", @@ -824,7 +911,7 @@ def : SchedAlias; // ASIMD absolute diff accum // ASIMD absolute diff accum long // ASIMD pairwise add and accumulate long -def : InstRW<[V1Write_4c_1V13], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>; +def : InstRW<[V1Wr_ADA, V1Rd_ADA], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>; // ASIMD arith, reduce, 4H/4S // ASIMD max/min, reduce, 4H/4S @@ -843,23 +930,26 @@ def : InstRW<[V1Write_4c_2V13], (instregex "^(ADD|[SU]ADDL)Vv16i8v$", // ASIMD dot product // ASIMD dot product using signed and unsigned integers -def : InstRW<[V1Write_2c_1V], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>; +def : InstRW<[V1Wr_VDOT, V1Rd_VDOT], + (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>; -// ASIMD matrix multiply- accumulate -def : InstRW<[V1Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>; +// ASIMD matrix multiply-accumulate +def : InstRW<[V1Wr_VMMA, V1Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>; // ASIMD multiply +def : InstRW<[V1Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>; + // ASIMD multiply accumulate +def : InstRW<[V1Wr_VMA, V1Rd_VMA], (instregex "^MLAv", "^MLSv")>; + // ASIMD multiply accumulate long +def : InstRW<[V1Wr_VMAL, V1Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>; + // ASIMD multiply accumulate high +def : InstRW<[V1Write_4c_1V02], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>; + // ASIMD multiply accumulate saturating long -def : InstRW<[V1Write_4c_1V02], - (instregex "^MUL(v[148]i16|v[124]i32)$", - 
"^SQR?DMULH(v[48]i16|v[24]i32)$", - "^ML[AS](v[148]i16|v[124]i32)$", - "^[SU]ML[AS]Lv", - "^SQRDML[AS]H(v[148]i16|v[124]i32)$", - "^SQDML[AS]Lv")>; +def : InstRW<[V1Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>; // ASIMD multiply/multiply long (8x8) polynomial def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>; @@ -868,11 +958,12 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>; def : InstRW<[V1Write_3c_1V02], (instregex "^([SU]|SQD)MULLv")>; // ASIMD shift accumulate +def : InstRW<[V1Wr_VSA, V1Rd_VSA], (instregex "^[SU]SRAv", "^[SU]RSRAv")>; + // ASIMD shift by immed, complex // ASIMD shift by register, complex def : InstRW<[V1Write_4c_1V13], - (instregex "^[SU]R?SRAv", - "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$", + (instregex "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$", "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$", "^SQSHU?RNv", "^[SU]RSHRv", "^UQR?SHRNv", "^[SU]Q?RSHLv", "^[SU]QSHLv")>; @@ -890,16 +981,25 @@ def : InstRW<[V1Write_2c_1V13], (instregex "^SHLL?v", "^SHRNv", "^[SU]SHLLv", // ASIMD FP absolute value/difference // ASIMD FP arith, normal // ASIMD FP compare -// ASIMD FP complex add // ASIMD FP max/min, normal // ASIMD FP max/min, pairwise // ASIMD FP negate // Covered by "SchedAlias (WriteV[dq]...)" above +// ASIMD FP complex add +def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$")>; + // ASIMD FP complex multiply add +def : InstRW<[V1Wr_FCMA, V1Rd_FCMA], (instregex "^FCMLAv")>; + +// ASIMD FP multiply +def : InstRW<[V1Wr_FPM], (instregex "^FMULX?v")>; + // ASIMD FP multiply accumulate -def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$", - "^FML[AS]v")>; +def : InstRW<[V1Wr_FPMA, V1Rd_FPMA], (instregex "^FML[AS]v")>; + +// ASIMD FP multiply accumulate long +def : InstRW<[V1Wr_FPMAL, V1Rd_FPMAL], (instregex "^FML[AS]L2?v")>; // ASIMD FP convert, long (F16 to F32) def : InstRW<[V1Write_4c_2V02], (instregex "^FCVTLv[48]i16$")>; @@ 
-953,12 +1053,6 @@ def : InstRW<[V1Write_4c_2V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>; // ASIMD FP max/min, reduce, Q-form F16 def : InstRW<[V1Write_6c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>; -// ASIMD FP multiply -def : InstRW<[V1Write_3c_1V], (instregex "^FMULX?v")>; - -// ASIMD FP multiply accumulate long -def : InstRW<[V1Write_5c_1V], (instregex "^FML[AS]L2?v")>; - // ASIMD FP round, D-form F32 and Q-form F64 def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>; @@ -976,13 +1070,13 @@ def : InstRW<[V1Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>; def : InstRW<[V1Write_4c_1V02], (instrs BFCVTN, BFCVTN2)>; // ASIMD dot product -def : InstRW<[V1Write_4c_1V], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>; +def : InstRW<[V1Wr_BFD, V1Rd_BFD], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>; // ASIMD matrix multiply accumulate -def : InstRW<[V1Write_5c_1V], (instrs BFMMLA)>; +def : InstRW<[V1Wr_BFMMA, V1Rd_BFMMA], (instrs BFMMLA)>; // ASIMD multiply accumulate long -def : InstRW<[V1Write_4c_1V], (instregex "^BFMLAL[BT](Idx)?$")>; +def : InstRW<[V1Wr_BFMLA, V1Rd_BFMLA], (instregex "^BFMLAL[BT](Idx)?$")>; // Scalar convert, F32 to BF16 def : InstRW<[V1Write_3c_1V02], (instrs BFCVT)>; @@ -1300,7 +1394,7 @@ def : InstRW<[V1Write_2c_1V0], (instrs BCAX, EOR3, RAX1, XAR)>; // ----------------------------------------------------------------------------- // CRC checksum ops -def : InstRW<[V1Write_2c_1M0], (instregex "^CRC32C?[BHWX]rr$")>; +def : InstRW<[V1Wr_CRC, V1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>; // SVE Predicate instructions @@ -1440,13 +1534,14 @@ def : InstRW<[V1Write_20c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_D", "^[SU]DIV_ZPZZ_D")>; // Dot product, 8 bit -def : InstRW<[V1Write_3c_1V01], (instregex "^[SU]DOT_ZZZI?_S$")>; +def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S$")>; // Dot product, 8 bit, using signed and unsigned integers -def : InstRW<[V1Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>; +def 
: InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB], + (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>; // Dot product, 16 bit -def : InstRW<[V1Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>; +def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D$")>; // Duplicate, immediate and indexed form def : InstRW<[V1Write_2c_1V01], (instregex "^DUP_ZI_[BHSD]$", @@ -1488,7 +1583,7 @@ def : InstRW<[V1Write_2c_1V01], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$", "^MOVPRFX_ZZ$")>; // Matrix multiply-accumulate -def : InstRW<[V1Write_3c_1V01], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; +def : InstRW<[V1Wr_ZMMA, V1Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; // Multiply, B, H, S element size def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]", @@ -1497,12 +1592,16 @@ def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]", "^[SU]MULH_ZPZZ_[BHS]")>; // Multiply, D element size -// Multiply accumulate, D element size def : InstRW<[V1Write_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D", "^MUL_ZPZZ_D", "^[SU]MULH_(ZPmZ|ZZZ)_D", - "^[SU]MULH_ZPZZ_D", - "^(MLA|MLS|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>; + "^[SU]MULH_ZPZZ_D")>; + +// Multiply accumulate, D element size +def : InstRW<[V1Wr_ZMAD, V1Rd_ZMAD], + (instregex "^ML[AS]_ZPZZZ_D")>; +def : InstRW<[V1Wr_ZMAD, ReadDefault, V1Rd_ZMAD], + (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>; // Multiply accumulate, B, H, S element size // NOTE: This is not specified in the SOG. 
@@ -1583,8 +1682,8 @@ def : InstRW<[V1Write_2c_1V0], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$", def : InstRW<[V1Write_3c_1V01], (instregex "^FCADD_ZPmZ_[HSD]$")>; // Floating point complex multiply add -def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$", - "^FCMLA_ZZZI_[HS]$")>; +def : InstRW<[V1Wr_ZFCMA, ReadDefault, V1Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>; +def : InstRW<[V1Wr_ZFCMA, V1Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>; // Floating point convert, long or narrow (F16 to F32 or F32 to F16) // Floating point convert to integer, F32 @@ -1623,11 +1722,15 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]", "^FMUL_ZPZ[IZ]_[HSD]")>; // Floating point multiply accumulate +def : InstRW<[V1Wr_ZFMA, ReadDefault, V1Rd_ZFMA], + (instregex "^FN?ML[AS]_ZPmZZ_[HSD]", + "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>; +def : InstRW<[V1Wr_ZFMA, V1Rd_ZFMA], + (instregex "^FML[AS]_ZZZI_[HSD]", + "^FN?ML[AS]_ZPZZZ_[HSD]")>; + // Floating point reciprocal step -def : InstRW<[V1Write_4c_1V01], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$", - "^FN?ML[AS]_ZPZZZ_[HSD]", - "^FML[AS]_ZZZI_[HSD]$", - "^F(RECPS|RSQRTS)_ZZZ_[HSD]$")>; +def : InstRW<[V1Write_4c_1V01], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>; // Floating point reciprocal estimate, F16 def : InstRW<[V1Write_6c_4V0], (instrs FRECPE_ZZ_H, FRSQRTE_ZZ_H)>; @@ -1681,13 +1784,13 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^FEXPA_ZZ_[HSD]$", def : InstRW<[V1Write_4c_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; // Dot product -def : InstRW<[V1Write_4c_1V01], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; +def : InstRW<[V1Wr_ZBFDOT, V1Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; // Matrix multiply accumulate -def : InstRW<[V1Write_5c_1V01], (instrs BFMMLA_ZZZ)>; +def : InstRW<[V1Wr_ZBFMMA, V1Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>; // Multiply accumulate long -def : InstRW<[V1Write_5c_1V01], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; +def : InstRW<[V1Wr_ZBFMAL, V1Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; // SVE Load 
instructions diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s new file mode 100644 index 00000000000000..4de37f96000520 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s @@ -0,0 +1,1421 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -mattr=+sve --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s + +# LLVM-MCA-BEGIN madd +mul x0, x0, x0 +madd x0, x1, x2, x0 +madd x0, x1, x2, x0 +madd x0, x0, x0, x0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smaddl +mul x0, x0, x0 +smaddl x0, w1, w2, x0 +smaddl x0, w1, w2, x0 +smaddl x0, w0, w0, x0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmadd +fadd d0, d0, d0 +fmadd d0, d1, d2, d0 +fmul d0, d0, d0 +fmadd d0, d1, d2, d0 +fmadd d0, d1, d2, d0 +fmadd d0, d0, d1, d2 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN saba +mul v0.4s, v0.4s, v0.4s +saba v0.4s, v1.4s, v2.4s +saba v0.4s, v1.4s, v2.4s +saba v0.4s, v0.4s, v1.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN sadalp +mul v0.4s, v0.4s, v0.4s +sadalp v0.2d, v1.4s +sadalp v0.2d, v1.4s +sadalp v0.2d, v0.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN sdot +mul v0.4s, v0.4s, v0.4s +sdot v0.4s, v1.16b, v2.16b +sdot v0.4s, v1.16b, v2.16b +sdot v0.4s, v0.16b, v1.16b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smmla +mul v0.4s, v0.4s, v0.4s +smmla v0.4s, v1.16b, v2.16b +smmla v0.4s, v1.16b, v2.16b +smmla v0.4s, v0.16b, v1.16b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN mla +mul v0.4s, v0.4s, v0.4s +mla v0.4s, v1.4s, v2.4s +mla v0.4s, v1.4s, v2.4s +mla v0.4s, v0.4s, v1.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smlal2 +mul v0.4s, v0.4s, v0.4s +smlal2 v0.4s, v1.8h, v2.8h +smlal2 v0.4s, v1.8h, v2.8h +smlal2 v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN ssra +mul v0.4s, v0.4s, v0.4s +ssra v0.2d, v1.2d, #1 +ssra v0.2d, v1.2d, #1 +ssra v0.2d, v0.2d, #1 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fcmla +fmul 
v0.4s, v0.4s, v0.4s +fcmla v0.2d, v1.2d, v2.2d, #90 +fcmla v0.2d, v1.2d, v2.2d, #90 +fcmla v0.2d, v0.2d, v1.2d, #90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmla +fmul v0.2d, v0.2d, v0.2d +fmla v0.2d, v1.2d, v2.2d +fadd v0.2d, v0.2d, v0.2d +fmla v0.2d, v1.2d, v2.2d +fmla v0.2d, v1.2d, v2.2d +fmla v0.2d, v0.2d, v1.2d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmlal +fmul v0.2d, v0.2d, v0.2d +fmlal v0.4s, v1.4h, v2.4h +fadd v0.2d, v0.2d, v0.2d +fmlal v0.4s, v1.4h, v2.4h +fmlal v0.4s, v1.4h, v2.4h +fmlal v0.4s, v0.4h, v1.4h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfdot +fmul v0.2d, v0.2d, v0.2d +bfdot v0.4s, v1.8h, v2.8h +bfdot v0.4s, v1.8h, v2.8h +bfdot v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmmla +fmul v0.2d, v0.2d, v0.2d +bfmmla v0.4s, v1.8h, v2.8h +bfmmla v0.4s, v1.8h, v2.8h +bfmmla v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmlalb +fmul v0.2d, v0.2d, v0.2d +bfmlalb v0.4s, v1.8h, v2.8h +bfmlalb v0.4s, v1.8h, v2.8h +bfmlalb v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN crc32cb +mul w0, w0, w0 +crc32cb w0, w0, w1 +crc32cb w0, w0, w1 +crc32cb w0, w0, w0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sdot.s +mul z0.d, p0/m, z0.d, z0.d +sdot z0.s, z1.b, z2.b +sdot z0.s, z1.b, z2.b +sdot z0.s, z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sudot +mul z0.d, p0/m, z0.d, z0.d +sdot z0.s, z1.b, z2.b[1] +sdot z0.s, z1.b, z2.b[1] +sdot z0.s, z0.b, z1.b[1] +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sdot.d +mul z0.d, p0/m, z0.d, z0.d +sdot z0.d, z1.h, z2.h +sdot z0.d, z1.h, z2.h +sdot z0.d, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z smmla +mul z0.d, p0/m, z0.d, z0.d +smmla z0.s, z1.b, z2.b +smmla z0.s, z1.b, z2.b +smmla z0.s, z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z mla.d +mul z0.d, p0/m, z0.d, z0.d +mla z0.d, p0/m, z1.d, z2.d +mla z0.d, p0/m, z1.d, z2.d +mla z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z mad.d +mul z0.d, p0/m, z0.d, z0.d +mad z0.d, p0/m, z1.d, z2.d +mad z0.d, p0/m, z1.d, z2.d +mad z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END 
+ +# LLVM-MCA-BEGIN Z msb.d +mul z0.d, p0/m, z0.d, z0.d +msb z0.d, p0/m, z1.d, z2.d +msb z0.d, p0/m, z1.d, z2.d +msb z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fcmla ZPmZZ +fmul z0.d, z0.d, z0.d +fcmla z0.d, p0/m, z1.d, z2.d, 90 +fcmla z0.d, p0/m, z1.d, z2.d, 90 +fcmla z0.d, p0/m, z0.d, z1.d, 90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fcmla ZZZI +fmul z0.d, z0.d, z0.d +fcmla z0.s, z1.s, z2.s[1], 90 +fcmla z0.s, z1.s, z2.s[1], 90 +fcmla z0.s, z0.s, z1.s[1], 90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fmla ZPmZZ +fmul z0.d, z0.d, z0.d +fmla z0.d, p0/m, z1.d, z2.d +fmla z0.d, p0/m, z1.d, z2.d +fmla z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fmla ZZZI +fmul z0.d, z0.d, z0.d +fmla z0.d, z1.d, z2.d[1] +fmla z0.d, z1.d, z2.d[1] +fmla z0.d, z0.d, z1.d[1] +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z bfdot +fmul z0.d, z0.d, z0.d +bfdot z0.s, z1.h, z2.h +bfdot z0.s, z1.h, z2.h +bfdot z0.s, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z bfmmla +fmul z0.d, z0.d, z0.d +bfmmla z0.s, z1.h, z2.h +bfmmla z0.s, z1.h, z2.h +bfmmla z0.s, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmlalb +fmul z0.d, z0.d, z0.d +bfmlalb z0.s, z1.h, z2.h +bfmlalb z0.s, z1.h, z2.h +bfmlalb z0.s, z0.h, z1.h +# LLVM-MCA-END + +# CHECK: [0] Code Region - madd + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul x0, x0, x0 +# CHECK-NEXT: [0,1] D==eeER . .. madd x0, x1, x2, x0 +# CHECK-NEXT: [0,2] D===eeER . .. madd x0, x1, x2, x0 +# CHECK-NEXT: [0,3] D=====eeER. .. madd x0, x0, x0, x0 +# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0 +# CHECK-NEXT: [1,1] D=========eeER .. madd x0, x1, x2, x0 +# CHECK-NEXT: [1,2] D==========eeER.. 
madd x0, x1, x2, x0 +# CHECK-NEXT: [1,3] D============eeER madd x0, x0, x0, x0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0 +# CHECK-NEXT: 1. 2 6.5 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 2. 2 7.5 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 3. 2 9.5 0.0 0.0 madd x0, x0, x0, x0 +# CHECK-NEXT: 2 7.0 0.1 0.0 + +# CHECK: [1] Code Region - smaddl + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul x0, x0, x0 +# CHECK-NEXT: [0,1] D==eeER . .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [0,2] D===eeER . .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [0,3] D=====eeER. .. smaddl x0, w0, w0, x0 +# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0 +# CHECK-NEXT: [1,1] D=========eeER .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [1,2] D==========eeER.. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [1,3] D============eeER smaddl x0, w0, w0, x0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0 +# CHECK-NEXT: 1. 2 6.5 0.0 0.0 smaddl x0, w1, w2, x0 +# CHECK-NEXT: 2. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0 +# CHECK-NEXT: 3. 
2 9.5 0.0 0.0 smaddl x0, w0, w0, x0 +# CHECK-NEXT: 2 7.0 0.1 0.0 + +# CHECK: [2] Code Region - fmadd + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.35 +# CHECK-NEXT: IPC: 0.35 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeER. . . . . . .. fadd d0, d0, d0 +# CHECK-NEXT: [0,1] D==eeeeER . . . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,2] D======eeeER . . . . .. fmul d0, d0, d0 +# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmadd d0, d0, d1, d2 +# CHECK-NEXT: [1,0] D=================eeER . . .. fadd d0, d0, d0 +# CHECK-NEXT: [1,1] D===================eeeeER . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,2] D=======================eeeER . .. fmul d0, d0, d0 +# CHECK-NEXT: [1,3] D========================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,4] D==========================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,5] D==============================eeeeER fmadd d0, d0, d1, d2 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fadd d0, d0, d0 +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmul d0, d0, d0 +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 5. 
2 22.5 0.0 0.0 fmadd d0, d0, d1, d2 +# CHECK-NEXT: 2 15.7 0.1 0.0 + +# CHECK: [3] Code Region - saba + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . saba v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,2] D==================eeeeER. . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] D======================eeeeER saba v0.4s, v0.4s, v1.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 saba v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [4] Code Region - sadalp + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . sadalp v0.2d, v0.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [1,2] D==================eeeeER. . sadalp v0.2d, v1.4s +# CHECK-NEXT: [1,3] D======================eeeeER sadalp v0.2d, v0.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sadalp v0.2d, v1.4s +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sadalp v0.2d, v1.4s +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 sadalp v0.2d, v0.4s +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [5] Code Region - sdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1103 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.36 +# CHECK-NEXT: Block RThroughput: 0.8 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeER . . . . 
mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeER. . . . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,2] D=====eeeER . . . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,3] D========eeeER . . . sdot v0.4s, v0.16b, v1.16b +# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D===============eeeER . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,2] D================eeeER . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,3] D===================eeeER sdot v0.4s, v0.16b, v1.16b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 2. 2 11.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 3. 2 14.5 0.0 0.0 sdot v0.4s, v0.16b, v1.16b +# CHECK-NEXT: 2 10.8 0.1 0.0 + +# CHECK: [6] Code Region - smmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1103 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.36 +# CHECK-NEXT: Block RThroughput: 0.8 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeER . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeER. . . . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,2] D=====eeeER . . . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,3] D========eeeER . . . smmla v0.4s, v0.16b, v1.16b +# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D===============eeeER . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,2] D================eeeER . 
smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,3] D===================eeeER smmla v0.4s, v0.16b, v1.16b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 2. 2 11.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 3. 2 14.5 0.0 0.0 smmla v0.4s, v0.16b, v1.16b +# CHECK-NEXT: 2 10.8 0.1 0.0 + +# CHECK: [7] Code Region - mla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . mla v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,2] D==================eeeeER. . 
mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] D======================eeeeER mla v0.4s, v0.4s, v1.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [8] Code Region - smlal2 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . smlal2 v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D==================eeeeER. . 
smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D======================eeeeER smlal2 v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [9] Code Region - ssra + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [0,2] D=====eeeeER . . . . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [0,3] D=========eeeeER . . . ssra v0.2d, v0.2d, #1 +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [1,2] D==================eeeeER. . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [1,3] D======================eeeeER ssra v0.2d, v0.2d, #1 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 
2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 ssra v0.2d, v0.2d, #1 +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [10] Code Region - fcmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla v0.2d, v0.2d, v1.2d, #90 +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D================eeeeER . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [1,2] D==================eeeeER. . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [1,3] D======================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90 +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [11] Code Region - fmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.35 +# CHECK-NEXT: IPC: 0.35 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D=eeeeER . . . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,2] D=====eeER. . . . . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: [1,0] D=================eeeER . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D==================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,2] D======================eeER . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] D========================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,4] D==========================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,5] D==============================eeeeER fmla v0.2d, v0.2d, v1.2d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 2. 2 14.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 4. 
2 18.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: 2 15.3 0.1 0.0 + +# CHECK: [12] Code Region - fmlal + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 2203 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456789 +# CHECK-NEXT: Index 0123456789 0123456789 0123456 + +# CHECK: [0,0] DeeeER . . . . . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,2] D========eeER . . . . . . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,4] D============eeeeeER. . . . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,5] D=================eeeeeER. . . . .. fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: [1,0] D======================eeeER . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D=========================eeeeeER . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,2] D==============================eeER. . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] D================================eeeeeER. .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,4] D==================================eeeeeER .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,5] D=======================================eeeeeER fmlal v0.4s, v0.4h, v1.4h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 12.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 
2 15.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 2. 2 20.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 22.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 4. 2 24.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 5. 2 29.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: 2 20.3 0.1 0.0 + +# CHECK: [13] Code Region - bfdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfdot v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D================eeeeER . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D==================eeeeER. . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D======================eeeeER bfdot v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [14] Code Region - bfmmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1603 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 01234 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D======eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfmmla v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D================eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D===================eeeeeER . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D======================eeeeeER. . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D===========================eeeeeER bfmmla v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 
2 20.0 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 14.0 0.1 0.0 + +# CHECK: [15] Code Region - bfmlalb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfmlalb v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D================eeeeER . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D==================eeeeER. . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D======================eeeeER bfmlalb v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [16] Code Region - crc32cb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul w0, w0, w0 +# CHECK-NEXT: [0,1] D==eeER . .. crc32cb w0, w0, w1 +# CHECK-NEXT: [0,2] D===eeER . .. crc32cb w0, w0, w1 +# CHECK-NEXT: [0,3] D=====eeER. .. crc32cb w0, w0, w0 +# CHECK-NEXT: [1,0] D=======eeER .. mul w0, w0, w0 +# CHECK-NEXT: [1,1] D=========eeER .. crc32cb w0, w0, w1 +# CHECK-NEXT: [1,2] D==========eeER.. crc32cb w0, w0, w1 +# CHECK-NEXT: [1,3] D============eeER crc32cb w0, w0, w0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul w0, w0, w0 +# CHECK-NEXT: 1. 2 6.5 0.0 0.0 crc32cb w0, w0, w1 +# CHECK-NEXT: 2. 2 7.5 0.0 0.0 crc32cb w0, w0, w1 +# CHECK-NEXT: 3. 2 9.5 0.0 0.0 crc32cb w0, w0, w0 +# CHECK-NEXT: 2 7.0 0.1 0.0 + +# CHECK: [17] Code Region - Z sdot.s + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. 
sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b +# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [18] Code Region - Z sudot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,2] D==================eeeER .. 
sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b[1] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [19] Code Region - Z sdot.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [0,2] D======eeeeER . . . . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [0,3] D==========eeeeER . . . sdot z0.d, z0.h, z1.h +# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D===================eeeeER . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,2] D====================eeeeER . 
sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,3] D========================eeeeER sdot z0.d, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sdot z0.d, z0.h, z1.h +# CHECK-NEXT: 2 13.3 0.1 0.0 + +# CHECK: [20] Code Region - Z smmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [0,2] D======eeeER . . .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [0,3] D=========eeeER. . .. smmla z0.s, z0.b, z1.b +# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D=================eeeER .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [1,2] D==================eeeER .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] D=====================eeeER smmla z0.s, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 
2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 smmla z0.s, z0.b, z1.b +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [21] Code Region - Z mla.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.47 +# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D============eeeeeER. . . .. mla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D======================eeeeeER. .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D========================eeeeeER .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D============================eeeeeER mla z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 
2 21.0 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 15.4 0.1 0.0 + +# CHECK: [22] Code Region - Z mad.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.47 +# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D============eeeeeER. . . .. mad z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D======================eeeeeER. .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D========================eeeeeER .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D============================eeeeeER mad z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.5 0.0 0.0 mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 
2 21.0 0.0 0.0 mad z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 15.4 0.1 0.0 + +# CHECK: [23] Code Region - Z msb.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.47 +# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D============eeeeeER. . . .. msb z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D======================eeeeeER. .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D========================eeeeeER .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D============================eeeeeER msb z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.5 0.0 0.0 msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 
2 21.0 0.0 0.0 msb z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 15.4 0.1 0.0 + +# CHECK: [24] Code Region - Z fcmla ZPmZZ + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.d, p0/m, z0.d, z1.d, #90 +# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: 3. 
2 18.5 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90 +# CHECK-NEXT: 2 13.0 0.1 0.0 + +# CHECK: [25] Code Region - Z fcmla ZZZI + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.s, z0.s, z1.s[1], #90 +# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.s, z0.s, z1.s[1], #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: 3. 
2 18.5 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90 +# CHECK-NEXT: 2 13.0 0.1 0.0 + +# CHECK: [26] Code Region - Z fmla ZPmZZ + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [27] Code Region - Z fmla ZZZI + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, z0.d, z1.d[1] +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, z0.d, z1.d[1] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 fmla z0.d, z0.d, z1.d[1] +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [28] Code Region - Z bfdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfdot z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D================eeeeER . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] D==================eeeeER. . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] D======================eeeeER bfdot z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 bfdot z0.s, z0.h, z1.h +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [29] Code Region - Z bfmmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1603 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 01234 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D======eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfmmla z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D================eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D===================eeeeeER . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] D======================eeeeeER. . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] D===========================eeeeeER bfmmla z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 
2 20.0 0.0 0.0 bfmmla z0.s, z0.h, z1.h +# CHECK-NEXT: 2 14.0 0.1 0.0 + +# CHECK: [30] Code Region - bfmlalb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . bfmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D==================eeeeeER . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] D====================eeeeeER . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] D=========================eeeeeER bfmlalb z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 
2 18.5 0.0 0.0 bfmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: 2 13.0 0.1 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s index 1e8df4770d7950..65b73177c7b70a 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s @@ -1365,8 +1365,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 fcmgt s10, s11, s12 # CHECK-NEXT: 1 2 0.25 fcmgt v0.4s, v0.4s, #0.0 # CHECK-NEXT: 1 2 0.25 fcmgt v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 2 0.25 fcmla v0.2s, v0.2s, v0.2s, #90 -# CHECK-NEXT: 1 2 0.25 fcmla v0.4s, v0.4s, v0.s[1], #0 +# CHECK-NEXT: 1 4 0.25 fcmla v0.2s, v0.2s, v0.2s, #90 +# CHECK-NEXT: 1 4 0.25 fcmla v0.4s, v0.4s, v0.s[1], #0 # CHECK-NEXT: 1 2 0.25 fcmle d20, d21, #0.0 # CHECK-NEXT: 1 2 0.25 fcmle s10, s11, #0.0 # CHECK-NEXT: 1 2 0.25 fcmle v0.2d, v0.2d, #0.0 @@ -1651,7 +1651,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 7 8 1.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] # CHECK-NEXT: 8 8 1.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 # CHECK-NEXT: 8 8 1.00 * ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], x8 -# CHECK-NEXT: 1 2 0.25 mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 mla v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 mls v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 2 0.25 mov b0, v0.b[15] # CHECK-NEXT: 1 2 0.25 mov d6, v0.d[1] @@ -1673,7 +1673,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 movi v0.2s, #8, msl #8 # CHECK-NEXT: 1 2 0.25 movi v0.4s, #255, lsl #24 # CHECK-NEXT: 1 2 0.25 movi v0.8b, #255 -# CHECK-NEXT: 1 2 0.25 mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 mul v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 2 0.25 mvni v0.2s, #0 # CHECK-NEXT: 1 2 0.25 mvni v0.4s, #16, msl #16 # CHECK-NEXT: 1 2 0.25 neg d29, d24 @@ -1780,10 +1780,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 2 4 1.00 scvtf v0.4s, v0.4s # CHECK-NEXT: 1 2 0.25 scvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 4 6 1.00 scvtf v0.8h, 
v0.8h -# CHECK-NEXT: 1 2 0.25 sdot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 sdot v0.2s, v0.8b, v0.8b -# CHECK-NEXT: 1 2 0.25 sdot v0.4s, v0.16b, v0.16b -# CHECK-NEXT: 1 2 0.25 sdot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sdot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sdot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.25 sdot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.25 sdot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.25 shadd v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 2 0.25 shl d7, d10, #12 # CHECK-NEXT: 1 2 0.50 shl v0.16b, v0.16b, #3 @@ -1873,26 +1873,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 sqadd b20, b11, b15 # CHECK-NEXT: 1 2 0.25 sqadd v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 2 0.25 sqadd v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 2 0.25 sqdmlal d19, s24, s12 +# CHECK-NEXT: 1 4 0.50 sqdmlal d19, s24, s12 # CHECK-NEXT: 1 4 0.50 sqdmlal d8, s9, v0.s[1] # CHECK-NEXT: 1 4 0.50 sqdmlal s0, h0, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqdmlal s17, h27, h12 +# CHECK-NEXT: 1 4 0.50 sqdmlal s17, h27, h12 # CHECK-NEXT: 1 4 0.50 sqdmlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 sqdmlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 sqdmlal2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 sqdmlal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sqdmlsl d12, s23, s13 +# CHECK-NEXT: 1 4 0.50 sqdmlsl d12, s23, s13 # CHECK-NEXT: 1 4 0.50 sqdmlsl d8, s9, v0.s[1] # CHECK-NEXT: 1 4 0.50 sqdmlsl s0, h0, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqdmlsl s14, h12, h25 +# CHECK-NEXT: 1 4 0.50 sqdmlsl s14, h12, h25 # CHECK-NEXT: 1 4 0.50 sqdmlsl v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 sqdmlsl v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 sqdmlsl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 sqdmlsl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sqdmulh h10, h11, h12 -# CHECK-NEXT: 1 2 0.25 sqdmulh h7, h15, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqdmulh s15, s14, v0.s[1] -# CHECK-NEXT: 1 2 0.25 sqdmulh s20, s21, s2 +# CHECK-NEXT: 1 4 0.50 sqdmulh h10, h11, h12 +# CHECK-NEXT: 1 4 0.50 sqdmulh h7, h15, v0.h[3] +# 
CHECK-NEXT: 1 4 0.50 sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqdmulh s20, s21, s2 # CHECK-NEXT: 1 4 0.50 sqdmulh v0.2s, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 sqdmulh v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 sqdmull d1, s1, v0.s[1] @@ -1914,34 +1914,34 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 sqneg v0.4s, v0.4s # CHECK-NEXT: 1 2 0.25 sqneg v0.8b, v0.8b # CHECK-NEXT: 1 2 0.25 sqneg v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sqrdmlah h0, h1, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlah s0, s1, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah h0, h1, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah s0, s1, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: 1 4 0.50 sqrdmlah h0, h1, h2 # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4h, v1.4h, v2.4h # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.8h, v1.8h, v2.8h # CHECK-NEXT: 1 4 0.50 sqrdmlah s0, s1, s2 # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.2s, v1.2s, v2.2s # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 1 2 0.25 sqrdmlsh h0, h1, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh s0, s1, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh h0, h1, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh s0, s1, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: 1 4 
0.50 sqrdmlsh h0, h1, h2 # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4h, v1.4h, v2.4h # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.8h, v1.8h, v2.8h # CHECK-NEXT: 1 4 0.50 sqrdmlsh s0, s1, s2 # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.2s, v1.2s, v2.2s # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 1 2 0.25 sqrdmulh h10, h11, h12 -# CHECK-NEXT: 1 2 0.25 sqrdmulh h7, h15, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmulh s15, s14, v0.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmulh s20, s21, s2 +# CHECK-NEXT: 1 4 0.50 sqrdmulh h10, h11, h12 +# CHECK-NEXT: 1 4 0.50 sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmulh s20, s21, s2 # CHECK-NEXT: 1 4 0.50 sqrdmulh v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 sqrdmulh v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 sqrshl d31, d31, d31 @@ -2124,8 +2124,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 5 4 1.00 * st4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], x5 # CHECK-NEXT: 1 2 0.25 sub d15, d5, d16 # CHECK-NEXT: 1 2 0.25 sub v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1 2 0.25 sudot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 sudot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sudot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sudot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.25 suqadd b19, b14 # CHECK-NEXT: 1 2 0.25 suqadd d18, d22 # CHECK-NEXT: 1 2 0.25 suqadd h20, h15 @@ -2222,10 +2222,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 2 4 1.00 ucvtf v0.4s, v0.4s # CHECK-NEXT: 1 2 0.25 ucvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 4 6 1.00 ucvtf v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 udot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 udot v0.2s, v0.8b, v0.8b -# CHECK-NEXT: 1 2 0.25 udot v0.4s, v0.16b, v0.16b -# CHECK-NEXT: 1 2 0.25 udot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 udot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 udot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.25 udot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.25 udot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.25 uhadd v0.16b, v0.16b, v0.16b # 
CHECK-NEXT: 1 2 0.25 uhadd v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 uhsub v0.4s, v0.4s, v0.4s @@ -2356,10 +2356,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 ursra v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 ursra v0.8b, v0.8b, #3 # CHECK-NEXT: 1 4 0.50 ursra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 2 0.25 usdot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 usdot v0.2s, v0.8b, v0.8b -# CHECK-NEXT: 1 2 0.25 usdot v0.4s, v0.16b, v0.16b -# CHECK-NEXT: 1 2 0.25 usdot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 usdot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 usdot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.25 usdot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.25 usdot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.50 ushl d0, d0, d0 # CHECK-NEXT: 1 2 0.50 ushl v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 2 0.50 ushl v0.4s, v0.4s, v0.4s @@ -2465,7 +2465,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] -# CHECK-NEXT: - - - - 26.67 49.17 49.17 18.75 7.75 7.75 7.75 394.50 377.00 349.00 331.50 +# CHECK-NEXT: - - - - 26.67 49.17 49.17 18.75 7.75 7.75 7.75 401.00 370.50 355.50 325.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions: @@ -2892,7 +2892,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 1.00 1.00 1.00 - - - - 1.00 1.00 1.00 1.00 ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] # CHECK-NEXT: - - - - 1.00 1.00 1.00 0.25 0.25 0.25 0.25 1.00 1.00 1.00 1.00 ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 # CHECK-NEXT: - - - - 1.00 1.00 1.00 0.25 0.25 0.25 0.25 1.00 1.00 1.00 1.00 ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], x8 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - mla v0.8b, v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - mls v0.4h, v0.4h, v0.4h # CHECK-NEXT: - - - - 
- - - - - - - 0.25 0.25 0.25 0.25 mov b0, v0.b[15] # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mov d6, v0.d[1] @@ -2914,7 +2914,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 movi v0.2s, #8, msl #8 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 movi v0.4s, #255, lsl #24 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 movi v0.8b, #255 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - mul v0.8b, v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mvni v0.2s, #0 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mvni v0.4s, #16, msl #16 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 neg d29, d24 @@ -3114,26 +3114,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqadd b20, b11, b15 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqadd v0.16b, v0.16b, v0.16b # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqadd v0.2s, v0.2s, v0.2s -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlal d19, s24, s12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal d19, s24, s12 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal d8, s9, v0.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal s0, h0, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlal s17, h27, h12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal s17, h27, h12 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlsl d12, s23, s13 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl d12, s23, s13 # 
CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl d8, s9, v0.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl s0, h0, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlsl s14, h12, h25 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl s14, h12, h25 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh h10, h11, h12 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh h7, h15, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh s15, s14, v0.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh h10, h11, h12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh s20, s21, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh v0.2s, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh v0.4s, v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmull d1, s1, v0.s[1] @@ -3155,34 +3155,34 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqneg v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqneg v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqneg v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah h0, h1, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.8h, v1.8h, v2.h[3] -# 
CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah s0, s1, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah h0, h1, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah s0, s1, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah h0, h1, h2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4h, v1.4h, v2.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.8h, v1.8h, v2.8h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah s0, s1, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.2s, v1.2s, v2.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4s, v1.4s, v2.4s -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh h0, h1, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh s0, s1, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh h0, h1, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - 
sqrdmlsh s0, s1, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh h0, h1, h2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4h, v1.4h, v2.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.8h, v1.8h, v2.8h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh s0, s1, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.2s, v1.2s, v2.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4s, v1.4s, v2.4s -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh h10, h11, h12 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh h7, h15, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh s15, s14, v0.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh h10, h11, h12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh s20, s21, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh v0.4h, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - - 0.50 - 0.50 sqrshl d31, d31, d31 From 90627a5a190a99ae2991d524580d866484aaba16 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Fri, 11 Oct 2024 14:01:58 +0200 Subject: [PATCH 157/177] Revert "[XRay] Add support for instrumentation of DSOs on x86_64 (#90959)" This reverts commit a4402039bffd788b9af82435fd5a2fb311fdc6e8 and 4451f9f812d458f6b53785b27869674caf01e67b --- clang/include/clang/Basic/CodeGenOptions.def | 2 - clang/include/clang/Driver/Options.td | 5 - clang/include/clang/Driver/XRayArgs.h | 2 - 
clang/lib/Driver/ToolChains/CommonArgs.cpp | 12 +- clang/lib/Driver/XRayArgs.cpp | 21 -- clang/test/Driver/XRay/xray-shared.cpp | 17 - .../cmake/Modules/AllSupportedArchDefs.cmake | 1 - compiler-rt/cmake/config-ix.cmake | 4 - compiler-rt/include/xray/xray_interface.h | 65 +--- compiler-rt/lib/xray/CMakeLists.txt | 86 +----- compiler-rt/lib/xray/xray_dso_init.cpp | 62 ---- compiler-rt/lib/xray/xray_init.cpp | 183 ++--------- compiler-rt/lib/xray/xray_interface.cpp | 291 ++++-------------- .../lib/xray/xray_interface_internal.h | 83 +---- compiler-rt/lib/xray/xray_trampoline_x86_64.S | 24 +- compiler-rt/lib/xray/xray_x86_64.cpp | 23 +- .../xray/TestCases/Posix/basic-mode-dso.cpp | 47 --- .../TestCases/Posix/clang-xray-shared.cpp | 14 - .../test/xray/TestCases/Posix/dlopen.cpp | 107 ------- .../xray/TestCases/Posix/dso-dep-chains.cpp | 197 ------------ .../TestCases/Posix/patch-premain-dso.cpp | 45 --- .../Posix/patching-unpatching-dso.cpp | 75 ----- 22 files changed, 147 insertions(+), 1219 deletions(-) delete mode 100644 clang/test/Driver/XRay/xray-shared.cpp delete mode 100644 compiler-rt/lib/xray/xray_dso_init.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/dlopen.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index e45370bde74a5d..eac831278ee20d 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -136,8 +136,6 @@ CODEGENOPT(XRayIgnoreLoops , 1, 0) ///< Emit the XRay function index section. 
CODEGENOPT(XRayFunctionIndex , 1, 1) -///< Set when -fxray-shared is enabled -CODEGENOPT(XRayShared , 1, 0) ///< Set the minimum number of instructions in a function to determine selective ///< XRay instrumentation. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4ee16e213d0e13..d306c751505e98 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2946,11 +2946,6 @@ def fxray_selected_function_group : HelpText<"When using -fxray-function-groups, select which group of functions to instrument. Valid range is 0 to fxray-function-groups - 1">, MarshallingInfoInt, "0">; -defm xray_shared : BoolFOption<"xray-shared", - CodeGenOpts<"XRayShared">, DefaultFalse, - PosFlag, - NegFlag>; defm fine_grained_bitfield_accesses : BoolOption<"f", "fine-grained-bitfield-accesses", CodeGenOpts<"FineGrainedBitfieldAccesses">, DefaultFalse, diff --git a/clang/include/clang/Driver/XRayArgs.h b/clang/include/clang/Driver/XRayArgs.h index 1b5c4a4c42f12a..bdd3d979547eed 100644 --- a/clang/include/clang/Driver/XRayArgs.h +++ b/clang/include/clang/Driver/XRayArgs.h @@ -27,7 +27,6 @@ class XRayArgs { XRayInstrSet InstrumentationBundle; llvm::opt::Arg *XRayInstrument = nullptr; bool XRayRT = true; - bool XRayShared = false; public: /// Parses the XRay arguments from an argument list. 
@@ -36,7 +35,6 @@ class XRayArgs { llvm::opt::ArgStringList &CmdArgs, types::ID InputType) const; bool needsXRayRt() const { return XRayInstrument && XRayRT; } - bool needsXRayDSORt() const { return XRayInstrument && XRayRT && XRayShared; } llvm::ArrayRef modeList() const { return Modes; } XRayInstrSet instrumentationBundle() const { return InstrumentationBundle; } }; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 0a1b7c209563e8..0c6a585c3acffd 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1613,14 +1613,10 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, } bool tools::addXRayRuntime(const ToolChain&TC, const ArgList &Args, ArgStringList &CmdArgs) { - if (Args.hasArg(options::OPT_shared)) { - if (TC.getXRayArgs().needsXRayDSORt()) { - CmdArgs.push_back("--whole-archive"); - CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray-dso")); - CmdArgs.push_back("--no-whole-archive"); - return true; - } - } else if (TC.getXRayArgs().needsXRayRt()) { + if (Args.hasArg(options::OPT_shared)) + return false; + + if (TC.getXRayArgs().needsXRayRt()) { CmdArgs.push_back("--whole-archive"); CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray")); for (const auto &Mode : TC.getXRayArgs().modeList()) diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index d0bb5d4887c184..8c5134e2501358 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -63,23 +63,6 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { << XRayInstrument->getSpelling() << Triple.str(); } - if (Args.hasFlag(options::OPT_fxray_shared, options::OPT_fno_xray_shared, - false)) { - XRayShared = true; - - // DSO instrumentation is currently limited to x86_64 - if (Triple.getArch() != llvm::Triple::x86_64) { - D.Diag(diag::err_drv_unsupported_opt_for_target) - << "-fxray-shared" << Triple.str(); - } - - 
unsigned PICLvl = std::get<1>(tools::ParsePICArgs(TC, Args)); - if (!PICLvl) { - D.Diag(diag::err_opt_not_valid_without_opt) << "-fxray-shared" - << "-fPIC"; - } - } - // Both XRay and -fpatchable-function-entry use // TargetOpcode::PATCHABLE_FUNCTION_ENTER. if (Arg *A = Args.getLastArg(options::OPT_fpatchable_function_entry_EQ)) @@ -194,10 +177,6 @@ void XRayArgs::addArgs(const ToolChain &TC, const ArgList &Args, Args.addOptOutFlag(CmdArgs, options::OPT_fxray_function_index, options::OPT_fno_xray_function_index); - if (XRayShared) - Args.addOptInFlag(CmdArgs, options::OPT_fxray_shared, - options::OPT_fno_xray_shared); - if (const Arg *A = Args.getLastArg(options::OPT_fxray_instruction_threshold_EQ)) { int Value; diff --git a/clang/test/Driver/XRay/xray-shared.cpp b/clang/test/Driver/XRay/xray-shared.cpp deleted file mode 100644 index 215854e1fc7cef..00000000000000 --- a/clang/test/Driver/XRay/xray-shared.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fpic -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s -// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fno-PIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-PIC -// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fno-pic -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-PIC - -// On 64 bit darwin, PIC is always enabled -// RUN: %clang -### --target=x86_64-apple-darwin -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s - -// Check unsupported targets -// RUN: not %clang -### --target=aarch64-pc-freebsd -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s 
--check-prefix=ERR-TARGET -// RUN: not %clang -### --target=arm64-apple-macos -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-TARGET - -// CHECK: "-cc1" {{.*}}"-fxray-instrument" {{.*}}"-fxray-shared" -// ERR-TARGET: error: unsupported option '-fxray-shared' for target -// ERR-PIC: error: option '-fxray-shared' cannot be specified without '-fPIC' - diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake index 50a4256b82fe4e..809e9277156912 100644 --- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake +++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake @@ -104,7 +104,6 @@ else() set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64} ${MIPS32} ${MIPS64} powerpc64le ${HEXAGON} ${LOONGARCH64}) endif() -set(ALL_XRAY_DSO_SUPPORTED_ARCH ${X86_64}) set(ALL_SHADOWCALLSTACK_SUPPORTED_ARCH ${ARM64}) if (UNIX) diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 6134c9876b38e9..a93a88a9205001 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -668,9 +668,6 @@ if(APPLE) list_intersect(XRAY_SUPPORTED_ARCH ALL_XRAY_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) - list_intersect(XRAY_DSO_SUPPORTED_ARCH - ALL_XRAY_DSO_SUPPORTED_ARCH - SANITIZER_COMMON_SUPPORTED_ARCH) list_intersect(SHADOWCALLSTACK_SUPPORTED_ARCH ALL_SHADOWCALLSTACK_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) @@ -705,7 +702,6 @@ else() filter_available_targets(CFI_SUPPORTED_ARCH ${ALL_CFI_SUPPORTED_ARCH}) filter_available_targets(SCUDO_STANDALONE_SUPPORTED_ARCH ${ALL_SCUDO_STANDALONE_SUPPORTED_ARCH}) filter_available_targets(XRAY_SUPPORTED_ARCH ${ALL_XRAY_SUPPORTED_ARCH}) - filter_available_targets(XRAY_DSO_SUPPORTED_ARCH ${ALL_XRAY_DSO_SUPPORTED_ARCH}) filter_available_targets(SHADOWCALLSTACK_SUPPORTED_ARCH ${ALL_SHADOWCALLSTACK_SUPPORTED_ARCH}) filter_available_targets(GWP_ASAN_SUPPORTED_ARCH 
${ALL_GWP_ASAN_SUPPORTED_ARCH}) diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h index 675ea0cbc48c83..727431c04e4f73 100644 --- a/compiler-rt/include/xray/xray_interface.h +++ b/compiler-rt/include/xray/xray_interface.h @@ -93,78 +93,31 @@ enum XRayPatchingStatus { FAILED = 3, }; -/// This tells XRay to patch the instrumentation points in all currently loaded -/// objects. See XRayPatchingStatus for possible result values. +/// This tells XRay to patch the instrumentation points. See XRayPatchingStatus +/// for possible result values. extern XRayPatchingStatus __xray_patch(); -/// This tells XRay to patch the instrumentation points in the given object. -/// See XRayPatchingStatus for possible result values. -extern XRayPatchingStatus __xray_patch_object(int32_t ObjId); - /// Reverses the effect of __xray_patch(). See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_unpatch(); -/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for -/// possible result values. -extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); - -/// This unpacks the given (packed) function id and patches -/// the corresponding function. See XRayPatchingStatus for possible +/// This patches a specific function id. See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_patch_function(int32_t FuncId); -/// This patches a specific function in the given object. See XRayPatchingStatus -/// for possible result values. -extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId, - int32_t ObjId); - -/// This unpacks the given (packed) function id and unpatches -/// the corresponding function. See XRayPatchingStatus for possible +/// This unpatches a specific function id. See XRayPatchingStatus for possible /// result values. 
extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId); -/// This unpatches a specific function in the given object. -/// See XRayPatchingStatus for possible result values. -extern XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId, - int32_t ObjId); - -/// This function unpacks the given (packed) function id and returns the address -/// of the corresponding function. We return 0 if we encounter any error, even -/// if 0 may be a valid function address. +/// This function returns the address of the function provided a valid function +/// id. We return 0 if we encounter any error, even if 0 may be a valid function +/// address. extern uintptr_t __xray_function_address(int32_t FuncId); -/// This function returns the address of the function in the given object -/// provided valid function and object ids. We return 0 if we encounter any -/// error, even if 0 may be a valid function address. -extern uintptr_t __xray_function_address_in_object(int32_t FuncId, - int32_t ObjId); - -/// This function returns the maximum valid function id for the main executable -/// (object id = 0). Returns 0 if we encounter errors (when there are no -/// instrumented functions, etc.). +/// This function returns the maximum valid function id. Returns 0 if we +/// encounter errors (when there are no instrumented functions, etc.). extern size_t __xray_max_function_id(); -/// This function returns the maximum valid function id for the given object. -/// Returns 0 if we encounter errors (when there are no instrumented functions, -/// etc.). -extern size_t __xray_max_function_id_in_object(int32_t ObjId); - -/// This function returns the number of previously registered objects -/// (executable + loaded DSOs). Returns 0 if XRay has not been initialized. -extern size_t __xray_num_objects(); - -/// Unpacks the function id from the given packed id. -extern int32_t __xray_unpack_function_id(int32_t PackedId); - -/// Unpacks the object id from the given packed id. 
-extern int32_t __xray_unpack_object_id(int32_t PackedId); - -/// Creates and returns a packed id from the given function and object ids. -/// If the ids do not fit within the reserved number of bits for each part, the -/// high bits are truncated. -extern int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId); - /// Initialize the required XRay data structures. This is useful in cases where /// users want to control precisely when the XRay instrumentation data /// structures are initialized, for example when the XRay library is built with diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt index f38c07420c9abf..cf7b5062aae32d 100644 --- a/compiler-rt/lib/xray/CMakeLists.txt +++ b/compiler-rt/lib/xray/CMakeLists.txt @@ -10,10 +10,6 @@ set(XRAY_SOURCES xray_utils.cpp ) -set(XRAY_DSO_SOURCES - xray_dso_init.cpp - ) - # Implementation files for all XRay modes. set(XRAY_FDR_MODE_SOURCES xray_fdr_flags.cpp @@ -37,11 +33,6 @@ set(x86_64_SOURCES xray_trampoline_x86_64.S ) -set(x86_64_DSO_SOURCES - xray_trampoline_x86_64.S - ) - - set(arm_SOURCES xray_arm.cpp xray_trampoline_arm.S @@ -137,12 +128,10 @@ set(XRAY_IMPL_HEADERS # consumption by tests. set(XRAY_ALL_SOURCE_FILES ${XRAY_SOURCES} - ${XRAY_DSO_SOURCES} ${XRAY_FDR_MODE_SOURCES} ${XRAY_BASIC_MODE_SOURCES} ${XRAY_PROFILING_MODE_SOURCES} ${x86_64_SOURCES} - ${x86_64_DSO_SOURCES} ${arm_SOURCES} ${armhf_SOURCES} ${hexagon_SOURCES} @@ -173,9 +162,6 @@ set(XRAY_CFLAGS ${COMPILER_RT_CXX_CFLAGS}) set(XRAY_COMMON_DEFINITIONS SANITIZER_COMMON_NO_REDEFINE_BUILTINS XRAY_HAS_EXCEPTIONS=1) -# DSO trampolines need to be compiled with GOT addressing -set(XRAY_COMMON_DEFINITIONS_DSO ${XRAY_COMMON_DEFINITIONS} XRAY_PIC) - # Too many existing bugs, needs cleanup. 
append_list_if(COMPILER_RT_HAS_WNO_FORMAT -Wno-format XRAY_CFLAGS) @@ -215,16 +201,7 @@ if (APPLE) CFLAGS ${XRAY_CFLAGS} DEFS ${XRAY_COMMON_DEFINITIONS} DEPS ${XRAY_DEPS}) - add_compiler_rt_object_libraries(RTXrayDSO - OS ${XRAY_SUPPORTED_OS} - ARCHS ${XRAY_DSO_SUPPORTED_ARCH} - SOURCES ${XRAY_DSO_SOURCES} - ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} - CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS_DSO} - DEPS ${XRAY_DEPS}) set(XRAY_RTXRAY_ARCH_LIBS "") - set(XRAY_DSO_RTXRAY_ARCH_LIBS "") foreach(arch ${XRAY_SUPPORTED_ARCH}) if(NOT ${arch} IN_LIST XRAY_SOURCE_ARCHS) continue() @@ -238,17 +215,6 @@ if (APPLE) DEFS ${XRAY_COMMON_DEFINITIONS} DEPS ${XRAY_DEPS}) list(APPEND XRAY_RTXRAY_ARCH_LIBS RTXray_${arch}) - if (${arch} IN_LIST XRAY_DSO_SUPPORTED_ARCH) - add_compiler_rt_object_libraries(RTXrayDSO_${arch} - OS ${XRAY_SUPPORTED_OS} - ARCHS ${XRAY_DSO_SUPPORTED_ARCH} - SOURCES ${${arch}_DSO_SOURCES} - ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} - CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS_DSO} - DEPS ${XRAY_DEPS}) - list(APPEND XRAY_DSO_RTXRAY_ARCH_LIBS RTXrayDSO_${arch}) - endif() endforeach() add_compiler_rt_object_libraries(RTXrayFDR OS ${XRAY_SUPPORTED_OS} @@ -286,17 +252,6 @@ if (APPLE) LINK_FLAGS ${XRAY_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} LINK_LIBS ${XRAY_LINK_LIBS} PARENT_TARGET xray) - add_compiler_rt_runtime(clang_rt.xray-dso - STATIC - OS ${XRAY_SUPPORTED_OS} - ARCHS ${XRAY_DSO_SUPPORTED_ARCH} - OBJECT_LIBS RTXrayDSO ${XRAY_DSO_RTXRAY_ARCH_LIBS} - CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS} - LINK_FLAGS ${XRAY_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} - LINK_LIBS ${XRAY_LINK_LIBS} - PARENT_TARGET xray) - add_compiler_rt_runtime(clang_rt.xray-fdr STATIC OS ${XRAY_SUPPORTED_OS} @@ -391,37 +346,16 @@ else() # not Apple DEFS ${XRAY_COMMON_DEFINITIONS} OBJECT_LIBS RTXrayBASIC PARENT_TARGET xray) - # Profiler Mode runtime - add_compiler_rt_runtime(clang_rt.xray-profiling - STATIC - ARCHS ${arch} - CFLAGS ${XRAY_CFLAGS} - LINK_FLAGS 
${XRAY_LINK_FLAGS} - LINK_LIBS ${XRAY_LINK_LIBS} - DEFS ${XRAY_COMMON_DEFINITIONS} - OBJECT_LIBS RTXrayPROFILING - PARENT_TARGET xray) - - if (${arch} IN_LIST XRAY_DSO_SUPPORTED_ARCH) - # TODO: Only implemented for X86 at the moment - add_compiler_rt_object_libraries(RTXrayDSO - ARCHS ${arch} - SOURCES ${XRAY_DSO_SOURCES} ${${arch}_DSO_SOURCES} - ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} - CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS_DSO} - DEPS ${XRAY_DEPS}) - # DSO runtime archive - add_compiler_rt_runtime(clang_rt.xray-dso - STATIC - ARCHS ${arch} - CFLAGS ${XRAY_CFLAGS} - LINK_FLAGS ${XRAY_LINK_FLAGS} - LINK_LIBS ${XRAY_LINK_LIBS} - DEFS ${XRAY_COMMON_DEFINITIONS} - OBJECT_LIBS RTXrayDSO - PARENT_TARGET xray) - endif() + # Profiler Mode runtime + add_compiler_rt_runtime(clang_rt.xray-profiling + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + LINK_FLAGS ${XRAY_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayPROFILING + PARENT_TARGET xray) endforeach() endif() # not Apple diff --git a/compiler-rt/lib/xray/xray_dso_init.cpp b/compiler-rt/lib/xray/xray_dso_init.cpp deleted file mode 100644 index eb754db54c64fa..00000000000000 --- a/compiler-rt/lib/xray/xray_dso_init.cpp +++ /dev/null @@ -1,62 +0,0 @@ -//===-- xray_init.cpp -------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of XRay, a dynamic runtime instrumentation system. -// -// XRay initialisation logic for DSOs. 
-//===----------------------------------------------------------------------===// - -#include "sanitizer_common/sanitizer_atomic.h" -#include "xray_defs.h" -#include "xray_flags.h" -#include "xray_interface_internal.h" - -using namespace __sanitizer; - -extern "C" { -extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak)) -__attribute__((visibility("hidden"))); -extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak)) -__attribute__((visibility("hidden"))); -extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)) -__attribute__((visibility("hidden"))); -extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)) -__attribute__((visibility("hidden"))); - -#if SANITIZER_APPLE -// HACK: This is a temporary workaround to make XRay build on -// Darwin, but it will probably not work at runtime. -extern const XRaySledEntry __start_xray_instr_map[] = {}; -extern const XRaySledEntry __stop_xray_instr_map[] = {}; -extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {}; -extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {}; -#endif -} - -// Handler functions to call in the patched entry/exit sled. -extern atomic_uintptr_t XRayPatchedFunction; -extern atomic_uintptr_t XRayArgLogger; -extern atomic_uintptr_t XRayPatchedCustomEvent; -extern atomic_uintptr_t XRayPatchedTypedEvent; - -static int __xray_object_id{-1}; - -// Note: .preinit_array initialization does not work for DSOs -__attribute__((constructor(0))) static void -__xray_init_dso() XRAY_NEVER_INSTRUMENT { - // Register sleds in main XRay runtime. - __xray_object_id = - __xray_register_dso(__start_xray_instr_map, __stop_xray_instr_map, - __start_xray_fn_idx, __stop_xray_fn_idx, {}); -} - -__attribute__((destructor(0))) static void -__xray_finalize_dso() XRAY_NEVER_INSTRUMENT { - // Inform the main runtime that this DSO is no longer used. 
- __xray_deregister_dso(__xray_object_id); -} diff --git a/compiler-rt/lib/xray/xray_init.cpp b/compiler-rt/lib/xray/xray_init.cpp index 53c93be89cd148..f22a31b95686d0 100644 --- a/compiler-rt/lib/xray/xray_init.cpp +++ b/compiler-rt/lib/xray/xray_init.cpp @@ -16,8 +16,6 @@ #include #include "sanitizer_common/sanitizer_common.h" -#include "xray/xray_interface.h" -#include "xray_allocator.h" #include "xray_defs.h" #include "xray_flags.h" #include "xray_interface_internal.h" @@ -30,7 +28,7 @@ extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)); extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)); #if SANITIZER_APPLE -// HACK: This is a temporary workaround to make XRay build on +// HACK: This is a temporary workaround to make XRay build on // Darwin, but it will probably not work at runtime. const XRaySledEntry __start_xray_instr_map[] = {}; extern const XRaySledEntry __stop_xray_instr_map[] = {}; @@ -45,16 +43,14 @@ using namespace __xray; // the weak symbols defined above (__start_xray_inst_map and // __stop_xray_instr_map) to initialise the instrumentation map that XRay uses // for runtime patching/unpatching of instrumentation points. +// +// FIXME: Support DSO instrumentation maps too. The current solution only works +// for statically linked executables. atomic_uint8_t XRayInitialized{0}; // This should always be updated before XRayInitialized is updated. SpinMutex XRayInstrMapMutex; - -// Contains maps for the main executable as well as DSOs. -XRaySledMap *XRayInstrMaps; - -// Number of binary objects registered. -atomic_uint32_t XRayNumObjects{0}; +XRaySledMap XRayInstrMap; // Global flag to determine whether the flags have been initialized. atomic_uint8_t XRayFlagsInitialized{0}; @@ -62,63 +58,6 @@ atomic_uint8_t XRayFlagsInitialized{0}; // A mutex to allow only one thread to initialize the XRay data structures. 
SpinMutex XRayInitMutex; -// Registers XRay sleds and trampolines coming from the main executable or one -// of the linked DSOs. -// Returns the object ID if registration is successful, -1 otherwise. -int32_t -__xray_register_sleds(const XRaySledEntry *SledsBegin, - const XRaySledEntry *SledsEnd, - const XRayFunctionSledIndex *FnIndexBegin, - const XRayFunctionSledIndex *FnIndexEnd, bool FromDSO, - XRayTrampolines Trampolines) XRAY_NEVER_INSTRUMENT { - if (!SledsBegin || !SledsEnd) { - Report("Invalid XRay sleds.\n"); - return -1; - } - XRaySledMap SledMap; - SledMap.FromDSO = FromDSO; - SledMap.Loaded = true; - SledMap.Trampolines = Trampolines; - SledMap.Sleds = SledsBegin; - SledMap.Entries = SledsEnd - SledsBegin; - if (FnIndexBegin != nullptr) { - SledMap.SledsIndex = FnIndexBegin; - SledMap.Functions = FnIndexEnd - FnIndexBegin; - } else { - size_t CountFunctions = 0; - uint64_t LastFnAddr = 0; - - for (std::size_t I = 0; I < SledMap.Entries; I++) { - const auto &Sled = SledMap.Sleds[I]; - const auto Function = Sled.function(); - if (Function != LastFnAddr) { - CountFunctions++; - LastFnAddr = Function; - } - } - SledMap.SledsIndex = nullptr; - SledMap.Functions = CountFunctions; - } - if (SledMap.Functions >= XRayMaxFunctions) { - Report("Too many functions! Maximum is %ld\n", XRayMaxFunctions); - return -1; - } - - if (Verbosity()) - Report("Registering %d new functions!\n", SledMap.Functions); - - { - SpinMutexLock Guard(&XRayInstrMapMutex); - auto Idx = atomic_fetch_add(&XRayNumObjects, 1, memory_order_acq_rel); - if (Idx >= XRayMaxObjects) { - Report("Too many objects registered! Maximum is %ld\n", XRayMaxObjects); - return -1; - } - XRayInstrMaps[Idx] = std::move(SledMap); - return Idx; - } -} - // __xray_init() will do the actual loading of the current process' memory map // and then proceed to look for the .xray_instr_map section/segment. 
void __xray_init() XRAY_NEVER_INSTRUMENT { @@ -141,21 +80,29 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { return; } - atomic_store(&XRayNumObjects, 0, memory_order_release); - - // Pre-allocation takes up approx. 5kB for XRayMaxObjects=64. - XRayInstrMaps = allocateBuffer(XRayMaxObjects); - - int MainBinaryId = - __xray_register_sleds(__start_xray_instr_map, __stop_xray_instr_map, - __start_xray_fn_idx, __stop_xray_fn_idx, false, {}); + { + SpinMutexLock Guard(&XRayInstrMapMutex); + XRayInstrMap.Sleds = __start_xray_instr_map; + XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map; + if (__start_xray_fn_idx != nullptr) { + XRayInstrMap.SledsIndex = __start_xray_fn_idx; + XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx; + } else { + size_t CountFunctions = 0; + uint64_t LastFnAddr = 0; + + for (std::size_t I = 0; I < XRayInstrMap.Entries; I++) { + const auto &Sled = XRayInstrMap.Sleds[I]; + const auto Function = Sled.function(); + if (Function != LastFnAddr) { + CountFunctions++; + LastFnAddr = Function; + } + } - // The executable should always get ID 0. - if (MainBinaryId != 0) { - Report("Registering XRay sleds failed.\n"); - return; + XRayInstrMap.Functions = CountFunctions; + } } - atomic_store(&XRayInitialized, true, memory_order_release); #ifndef XRAY_NO_PREINIT @@ -164,84 +111,6 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { #endif } -// Registers XRay sleds and trampolines of an instrumented DSO. -// Returns the object ID if registration is successful, -1 otherwise. -// -// Default visibility is hidden, so we have to explicitly make it visible to -// DSO. -SANITIZER_INTERFACE_ATTRIBUTE int32_t __xray_register_dso( - const XRaySledEntry *SledsBegin, const XRaySledEntry *SledsEnd, - const XRayFunctionSledIndex *FnIndexBegin, - const XRayFunctionSledIndex *FnIndexEnd, - XRayTrampolines Trampolines) XRAY_NEVER_INSTRUMENT { - // Make sure XRay has been initialized in the main executable. 
- __xray_init(); - - if (__xray_num_objects() == 0) { - if (Verbosity()) - Report("No XRay instrumentation map in main executable. Not initializing " - "XRay for DSO.\n"); - return -1; - } - - // Register sleds in global map. - int ObjId = __xray_register_sleds(SledsBegin, SledsEnd, FnIndexBegin, - FnIndexEnd, true, Trampolines); - -#ifndef XRAY_NO_PREINIT - if (ObjId >= 0 && flags()->patch_premain) - __xray_patch_object(ObjId); -#endif - - return ObjId; -} - -// Deregisters a DSO from the main XRay runtime. -// Called from the DSO-local runtime when the library is unloaded (e.g. if -// dlclose is called). -// Returns true if the object ID is valid and the DSO was successfully -// deregistered. -SANITIZER_INTERFACE_ATTRIBUTE bool -__xray_deregister_dso(int32_t ObjId) XRAY_NEVER_INSTRUMENT { - - if (!atomic_load(&XRayInitialized, memory_order_acquire)) { - if (Verbosity()) - Report("XRay has not been initialized. Cannot deregister DSO.\n"); - return false; - } - - if (ObjId <= 0 || ObjId >= __xray_num_objects()) { - if (Verbosity()) - Report("Can't deregister object with ID %d: ID is invalid.\n", ObjId); - return false; - } - - { - SpinMutexLock Guard(&XRayInstrMapMutex); - auto &Entry = XRayInstrMaps[ObjId]; - if (!Entry.FromDSO) { - if (Verbosity()) - Report("Can't deregister object with ID %d: object does not correspond " - "to a shared library.\n", - ObjId); - return false; - } - if (!Entry.Loaded) { - if (Verbosity()) - Report("Can't deregister object with ID %d: object is not loaded.\n", - ObjId); - return true; - } - // Mark DSO as unloaded. No need to unpatch. - Entry.Loaded = false; - } - - if (Verbosity()) - Report("Deregistered object with ID %d.\n", ObjId); - - return true; -} - // FIXME: Make check-xray tests work on FreeBSD without // SANITIZER_CAN_USE_PREINIT_ARRAY. // See sanitizer_internal_defs.h where the macro is defined. 
diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 402fc3d07b4e2a..5839043fcb93a8 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -36,8 +36,7 @@ extern __sanitizer::SpinMutex XRayInstrMapMutex; extern __sanitizer::atomic_uint8_t XRayInitialized; -extern __xray::XRaySledMap *XRayInstrMaps; -extern __sanitizer::atomic_uint32_t XRayNumObjects; +extern __xray::XRaySledMap XRayInstrMap; namespace __xray { @@ -62,16 +61,16 @@ static const int16_t cSledLength = 20; #endif /* CPU architecture */ // This is the function to call when we encounter the entry or exit sleds. -atomic_uintptr_t XRayPatchedFunction SANITIZER_INTERFACE_ATTRIBUTE{0}; +atomic_uintptr_t XRayPatchedFunction{0}; // This is the function to call from the arg1-enabled sleds/trampolines. -atomic_uintptr_t XRayArgLogger SANITIZER_INTERFACE_ATTRIBUTE{0}; +atomic_uintptr_t XRayArgLogger{0}; // This is the function to call when we encounter a custom event log call. -atomic_uintptr_t XRayPatchedCustomEvent SANITIZER_INTERFACE_ATTRIBUTE{0}; +atomic_uintptr_t XRayPatchedCustomEvent{0}; // This is the function to call when we encounter a typed event log call. -atomic_uintptr_t XRayPatchedTypedEvent SANITIZER_INTERFACE_ATTRIBUTE{0}; +atomic_uintptr_t XRayPatchedTypedEvent{0}; // This is the global status to determine whether we are currently // patching/unpatching. 
@@ -151,42 +150,27 @@ class MProtectHelper { namespace { -bool isObjectLoaded(int32_t ObjId) { - SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || - ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { - return false; - } - return XRayInstrMaps[ObjId].Loaded; -} - -bool patchSled(const XRaySledEntry &Sled, bool Enable, int32_t FuncId, - const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT { +bool patchSled(const XRaySledEntry &Sled, bool Enable, + int32_t FuncId) XRAY_NEVER_INSTRUMENT { bool Success = false; switch (Sled.Kind) { case XRayEntryType::ENTRY: - Success = - patchFunctionEntry(Enable, FuncId, Sled, Trampolines.EntryTrampoline); + Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_FunctionEntry); break; case XRayEntryType::EXIT: - Success = - patchFunctionExit(Enable, FuncId, Sled, Trampolines.ExitTrampoline); + Success = patchFunctionExit(Enable, FuncId, Sled); break; case XRayEntryType::TAIL: - Success = patchFunctionTailExit(Enable, FuncId, Sled, - Trampolines.TailExitTrampoline); + Success = patchFunctionTailExit(Enable, FuncId, Sled); break; case XRayEntryType::LOG_ARGS_ENTRY: - Success = - patchFunctionEntry(Enable, FuncId, Sled, Trampolines.LogArgsTrampoline); + Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry); break; case XRayEntryType::CUSTOM_EVENT: - Success = patchCustomEvent(Enable, FuncId, Sled, - Trampolines.CustomEventTrampoline); + Success = patchCustomEvent(Enable, FuncId, Sled); break; case XRayEntryType::TYPED_EVENT: - Success = - patchTypedEvent(Enable, FuncId, Sled, Trampolines.TypedEventTrampoline); + Success = patchTypedEvent(Enable, FuncId, Sled); break; default: Report("Unsupported sled kind '%" PRIu64 "' @%04x\n", Sled.Address, @@ -221,9 +205,10 @@ findFunctionSleds(int32_t FuncId, return Index; } -XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, +XRayPatchingStatus patchFunction(int32_t FuncId, bool Enable) XRAY_NEVER_INSTRUMENT { - if 
(!atomic_load(&XRayInitialized, memory_order_acquire)) + if (!atomic_load(&XRayInitialized, + memory_order_acquire)) return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. uint8_t NotPatching = false; @@ -235,24 +220,13 @@ XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || - ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { - Report("Unable to patch function: invalid sled map index: %d", ObjId); - return XRayPatchingStatus::FAILED; - } - InstrMap = XRayInstrMaps[ObjId]; + InstrMap = XRayInstrMap; } // If we don't have an index, we can't patch individual functions. if (InstrMap.Functions == 0) return XRayPatchingStatus::NOT_INITIALIZED; - // Check if the corresponding DSO has been unloaded. - if (!InstrMap.Loaded) { - Report("Invalid function id provided: %d\n", FuncId); - return XRayPatchingStatus::NOT_INITIALIZED; - } - // FuncId must be a positive number, less than the number of functions // instrumented. if (FuncId <= 0 || static_cast(FuncId) > InstrMap.Functions) { @@ -260,8 +234,6 @@ XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, return XRayPatchingStatus::FAILED; } - auto PackedId = __xray::MakePackedId(FuncId, ObjId); - // Now we patch ths sleds for this specific function. 
XRayFunctionSledIndex SledRange; if (InstrMap.SledsIndex) { @@ -270,13 +242,13 @@ XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, } else { SledRange = findFunctionSleds(FuncId, InstrMap); } - auto *f = SledRange.Begin; bool SucceedOnce = false; for (size_t i = 0; i != SledRange.Size; ++i) - SucceedOnce |= patchSled(f[i], Enable, PackedId, InstrMap.Trampolines); + SucceedOnce |= patchSled(f[i], Enable, FuncId); - atomic_store(&XRayPatching, false, memory_order_release); + atomic_store(&XRayPatching, false, + memory_order_release); if (!SucceedOnce) { Report("Failed patching any sled for function '%d'.", FuncId); @@ -289,31 +261,32 @@ XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, // controlPatching implements the common internals of the patching/unpatching // implementation. |Enable| defines whether we're enabling or disabling the // runtime XRay instrumentation. -// This function should only be called after ensuring that XRay is initialized -// and no other thread is currently patching. -XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) { +XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { + if (!atomic_load(&XRayInitialized, + memory_order_acquire)) + return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. + + uint8_t NotPatching = false; + if (!atomic_compare_exchange_strong( + &XRayPatching, &NotPatching, true, memory_order_acq_rel)) + return XRayPatchingStatus::ONGOING; // Already patching. 
+ + uint8_t PatchingSuccess = false; + auto XRayPatchingStatusResetter = + at_scope_exit([&PatchingSuccess] { + if (!PatchingSuccess) + atomic_store(&XRayPatching, false, + memory_order_release); + }); + XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || - ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { - Report("Unable to patch functions: invalid sled map index: %d\n", ObjId); - return XRayPatchingStatus::FAILED; - } - InstrMap = XRayInstrMaps[ObjId]; + InstrMap = XRayInstrMap; } if (InstrMap.Entries == 0) return XRayPatchingStatus::NOT_INITIALIZED; - if (Verbosity()) - Report("Patching object %d with %d functions.\n", ObjId, InstrMap.Entries); - - // Check if the corresponding DSO has been unloaded. - if (!InstrMap.Loaded) { - Report("Object is not loaded at index: %d\n", ObjId); - return XRayPatchingStatus::FAILED; - } - uint32_t FuncId = 1; uint64_t CurFun = 0; @@ -363,96 +336,20 @@ XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) { ++FuncId; CurFun = F; } - auto PackedId = __xray::MakePackedId(FuncId, ObjId); - patchSled(Sled, Enable, PackedId, InstrMap.Trampolines); + patchSled(Sled, Enable, FuncId); } - atomic_store(&XRayPatching, false, memory_order_release); + atomic_store(&XRayPatching, false, + memory_order_release); + PatchingSuccess = true; return XRayPatchingStatus::SUCCESS; } -// Controls patching for all registered objects. -// Returns: SUCCESS, if patching succeeds for all objects. -// NOT_INITIALIZED, if one or more objects returned NOT_INITIALIZED -// but none failed. -// FAILED, if patching of one or more objects failed. -XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { - if (!atomic_load(&XRayInitialized, memory_order_acquire)) - return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. 
- - uint8_t NotPatching = false; - if (!atomic_compare_exchange_strong(&XRayPatching, &NotPatching, true, - memory_order_acq_rel)) - return XRayPatchingStatus::ONGOING; // Already patching. - - auto XRayPatchingStatusResetter = at_scope_exit( - [] { atomic_store(&XRayPatching, false, memory_order_release); }); - - unsigned NumObjects = __xray_num_objects(); - - XRayPatchingStatus CombinedStatus{NOT_INITIALIZED}; - for (unsigned I = 0; I < NumObjects; ++I) { - if (!isObjectLoaded(I)) - continue; - auto LastStatus = controlPatchingObjectUnchecked(Enable, I); - switch (LastStatus) { - case SUCCESS: - if (CombinedStatus == NOT_INITIALIZED) - CombinedStatus = SUCCESS; - break; - case FAILED: - // Report failure, but try to patch the remaining objects - CombinedStatus = FAILED; - break; - case NOT_INITIALIZED: - // XRay has been initialized but there are no sleds available for this - // object. Try to patch remaining objects. - if (CombinedStatus != FAILED) - CombinedStatus = NOT_INITIALIZED; - break; - case ONGOING: - UNREACHABLE("Status ONGOING should not appear at this point"); - default: - UNREACHABLE("Unhandled patching status"); - } - } - return CombinedStatus; -} - -// Controls patching for one object. -XRayPatchingStatus controlPatching(bool Enable, - int32_t ObjId) XRAY_NEVER_INSTRUMENT { - - if (!atomic_load(&XRayInitialized, memory_order_acquire)) - return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. - - uint8_t NotPatching = false; - if (!atomic_compare_exchange_strong(&XRayPatching, &NotPatching, true, - memory_order_acq_rel)) - return XRayPatchingStatus::ONGOING; // Already patching. 
- - auto XRayPatchingStatusResetter = at_scope_exit( - [] { atomic_store(&XRayPatching, false, memory_order_release); }); - - return controlPatchingObjectUnchecked(Enable, ObjId); -} - -XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, int32_t ObjId, +XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, bool Enable) XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || - ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { - Report("Unable to patch function: invalid sled map index: %d\n", ObjId); - return XRayPatchingStatus::FAILED; - } - InstrMap = XRayInstrMaps[ObjId]; - } - - // Check if the corresponding DSO has been unloaded. - if (!InstrMap.Loaded) { - Report("Object is not loaded at index: %d\n", ObjId); - return XRayPatchingStatus::FAILED; + InstrMap = XRayInstrMap; } // FuncId must be a positive number, less than the number of functions @@ -501,7 +398,7 @@ XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, int32_t ObjId, Report("Failed mprotect: %d\n", errno); return XRayPatchingStatus::FAILED; } - return patchFunction(FuncId, ObjId, Enable); + return patchFunction(FuncId, Enable); } } // namespace @@ -515,10 +412,12 @@ using namespace __xray; int __xray_set_handler(void (*entry)(int32_t, XRayEntryType)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, memory_order_acquire)) { + if (atomic_load(&XRayInitialized, + memory_order_acquire)) { atomic_store(&__xray::XRayPatchedFunction, - reinterpret_cast(entry), memory_order_release); + reinterpret_cast(entry), + memory_order_release); return 1; } return 0; @@ -526,9 +425,11 @@ int __xray_set_handler(void (*entry)(int32_t, int __xray_set_customevent_handler(void (*entry)(void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, memory_order_acquire)) { + if (atomic_load(&XRayInitialized, + memory_order_acquire)) { atomic_store(&__xray::XRayPatchedCustomEvent, - reinterpret_cast(entry), 
memory_order_release); + reinterpret_cast(entry), + memory_order_release); return 1; } return 0; @@ -536,9 +437,11 @@ int __xray_set_customevent_handler(void (*entry)(void *, size_t)) int __xray_set_typedevent_handler(void (*entry)(size_t, const void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, memory_order_acquire)) { + if (atomic_load(&XRayInitialized, + memory_order_acquire)) { atomic_store(&__xray::XRayPatchedTypedEvent, - reinterpret_cast(entry), memory_order_release); + reinterpret_cast(entry), + memory_order_release); return 1; } return 0; @@ -571,78 +474,39 @@ XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT { return controlPatching(true); } -XRayPatchingStatus __xray_patch_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { - return controlPatching(true, ObjId); -} - XRayPatchingStatus __xray_unpatch() XRAY_NEVER_INSTRUMENT { return controlPatching(false); } -XRayPatchingStatus __xray_unpatch_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { - return controlPatching(false, ObjId); -} - XRayPatchingStatus __xray_patch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - auto Ids = __xray::UnpackId(FuncId); - auto ObjId = Ids.first; - auto FnId = Ids.second; - return mprotectAndPatchFunction(FnId, ObjId, true); -} - -XRayPatchingStatus -__xray_patch_function_in_object(int32_t FuncId, - int32_t ObjId) XRAY_NEVER_INSTRUMENT { - return mprotectAndPatchFunction(FuncId, ObjId, true); + return mprotectAndPatchFunction(FuncId, true); } XRayPatchingStatus __xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - auto Ids = __xray::UnpackId(FuncId); - auto ObjId = Ids.first; - auto FnId = Ids.second; - return mprotectAndPatchFunction(FnId, ObjId, false); -} - -XRayPatchingStatus -__xray_unpatch_function_in_object(int32_t FuncId, - int32_t ObjId) XRAY_NEVER_INSTRUMENT { - return mprotectAndPatchFunction(FuncId, ObjId, false); + return mprotectAndPatchFunction(FuncId, false); } int __xray_set_handler_arg1(void (*entry)(int32_t, 
XRayEntryType, uint64_t)) { - if (!atomic_load(&XRayInitialized, memory_order_acquire)) + if (!atomic_load(&XRayInitialized, + memory_order_acquire)) return 0; // A relaxed write might not be visible even if the current thread gets // scheduled on a different CPU/NUMA node. We need to wait for everyone to // have this handler installed for consistency of collected data across CPUs. atomic_store(&XRayArgLogger, reinterpret_cast(entry), - memory_order_release); + memory_order_release); return 1; } int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); } -uintptr_t -__xray_function_address(int32_t CombinedFuncId) XRAY_NEVER_INSTRUMENT { - auto Ids = __xray::UnpackId(CombinedFuncId); - return __xray_function_address_in_object(Ids.second, Ids.first); -} - -uintptr_t __xray_function_address_in_object(int32_t FuncId, int32_t ObjId) - XRAY_NEVER_INSTRUMENT { +uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - auto count = atomic_load(&XRayNumObjects, memory_order_acquire); - if (ObjId < 0 || ObjId >= count) { - Report("Unable to determine function address: invalid sled map index %d " - "(size is %d)\n", - ObjId, (int)count); - return 0; - } - InstrMap = XRayInstrMaps[ObjId]; + InstrMap = XRayInstrMap; } if (FuncId <= 0 || static_cast(FuncId) > InstrMap.Functions) @@ -661,29 +525,6 @@ uintptr_t __xray_function_address_in_object(int32_t FuncId, int32_t ObjId) } size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT { - return __xray_max_function_id_in_object(0); -} - -size_t __xray_max_function_id_in_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { - SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) - return 0; - return XRayInstrMaps[ObjId].Functions; -} - -size_t __xray_num_objects() XRAY_NEVER_INSTRUMENT { SpinMutexLock Guard(&XRayInstrMapMutex); - return atomic_load(&XRayNumObjects, 
memory_order_acquire); -} - -int32_t __xray_unpack_function_id(int32_t PackedId) { - return __xray::UnpackId(PackedId).second; -} - -int32_t __xray_unpack_object_id(int32_t PackedId) { - return __xray::UnpackId(PackedId).first; -} - -int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId) { - return __xray::MakePackedId(FuncId, ObjId); + return XRayInstrMap.Functions; } diff --git a/compiler-rt/lib/xray/xray_interface_internal.h b/compiler-rt/lib/xray/xray_interface_internal.h index 5fbaa9c3f315b1..80c07c167f6461 100644 --- a/compiler-rt/lib/xray/xray_interface_internal.h +++ b/compiler-rt/lib/xray/xray_interface_internal.h @@ -18,18 +18,6 @@ #include "xray/xray_interface.h" #include #include -#include - -extern "C" { -// The following functions have to be defined in assembler, on a per-platform -// basis. See xray_trampoline_*.S files for implementations. -extern void __xray_FunctionEntry(); -extern void __xray_FunctionExit(); -extern void __xray_FunctionTailExit(); -extern void __xray_ArgLoggerEntry(); -extern void __xray_CustomEvent(); -extern void __xray_TypedEvent(); -} extern "C" { @@ -79,77 +67,36 @@ struct XRayFunctionSledIndex { uintptr_t(Begin)); } }; - -struct XRayTrampolines { - void (*EntryTrampoline)(); - void (*ExitTrampoline)(); - void (*TailExitTrampoline)(); - void (*LogArgsTrampoline)(); - void (*CustomEventTrampoline)(); - void (*TypedEventTrampoline)(); - - XRayTrampolines() { - // These resolve to the definitions in the respective executable or DSO. 
- EntryTrampoline = __xray_FunctionEntry; - ExitTrampoline = __xray_FunctionExit; - TailExitTrampoline = __xray_FunctionTailExit; - LogArgsTrampoline = __xray_ArgLoggerEntry; - CustomEventTrampoline = __xray_CustomEvent; - TypedEventTrampoline = __xray_TypedEvent; - } -}; - -extern int32_t __xray_register_dso(const XRaySledEntry *SledsBegin, - const XRaySledEntry *SledsEnd, - const XRayFunctionSledIndex *FnIndexBegin, - const XRayFunctionSledIndex *FnIndexEnd, - XRayTrampolines Trampolines); - -extern bool __xray_deregister_dso(int32_t ObjId); } namespace __xray { -constexpr uint32_t XRayNFnBits = 24; -constexpr uint32_t XRayNObjBits = 8; - -constexpr uint32_t XRayFnBitMask = 0x00FFFFFF; -constexpr uint32_t XRayObjBitMask = 0xFF000000; - -constexpr size_t XRayMaxFunctions = 1 << XRayNFnBits; -constexpr size_t XRayMaxObjects = 1 << XRayNObjBits; - -inline int32_t MakePackedId(int32_t FnId, int32_t ObjId) { - return ((ObjId << XRayNFnBits) & XRayObjBitMask) | (FnId & XRayFnBitMask); -} - -inline std::pair UnpackId(int32_t PackedId) { - uint32_t ObjId = (PackedId & XRayObjBitMask) >> XRayNFnBits; - uint32_t FnId = PackedId & XRayFnBitMask; - return {ObjId, FnId}; -} - struct XRaySledMap { const XRaySledEntry *Sleds; size_t Entries; const XRayFunctionSledIndex *SledsIndex; size_t Functions; - XRayTrampolines Trampolines; - bool FromDSO; - bool Loaded; }; bool patchFunctionEntry(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, void (*Trampoline)()); -bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, - void (*Trampoline)()); +bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); bool patchFunctionTailExit(bool Enable, uint32_t FuncId, - const XRaySledEntry &Sled, void (*Trampoline)()); -bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, - void (*Trampoline)()); -bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, - void (*Trampoline)()); + const 
XRaySledEntry &Sled); +bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); +bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); } // namespace __xray +extern "C" { +// The following functions have to be defined in assembler, on a per-platform +// basis. See xray_trampoline_*.S files for implementations. +extern void __xray_FunctionEntry(); +extern void __xray_FunctionExit(); +extern void __xray_FunctionTailExit(); +extern void __xray_ArgLoggerEntry(); +extern void __xray_CustomEvent(); +extern void __xray_TypedEvent(); +} + #endif diff --git a/compiler-rt/lib/xray/xray_trampoline_x86_64.S b/compiler-rt/lib/xray/xray_trampoline_x86_64.S index 0f480547b52cc6..01098f60eeab8b 100644 --- a/compiler-rt/lib/xray/xray_trampoline_x86_64.S +++ b/compiler-rt/lib/xray/xray_trampoline_x86_64.S @@ -107,16 +107,6 @@ .section __TEXT,__text #endif -.macro LOAD_HANDLER_ADDR handler -#if !defined(XRAY_PIC) - movq ASM_SYMBOL(\handler)(%rip), %rax -#else - movq ASM_SYMBOL(\handler)@GOTPCREL(%rip), %rax - movq (%rax), %rax -#endif -.endm - - //===----------------------------------------------------------------------===// .globl ASM_SYMBOL(__xray_FunctionEntry) @@ -131,7 +121,7 @@ ASM_SYMBOL(__xray_FunctionEntry): // This load has to be atomic, it's concurrent with __xray_patch(). // On x86/amd64, a simple (type-aligned) MOV instruction is enough. 
- LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax, %rax je LOCAL_LABEL(tmp0) @@ -169,7 +159,7 @@ ASM_SYMBOL(__xray_FunctionExit): movupd %xmm1, 16(%rsp) movq %rax, 8(%rsp) movq %rdx, 0(%rsp) - LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax,%rax je LOCAL_LABEL(tmp2) @@ -205,7 +195,7 @@ ASM_SYMBOL(__xray_FunctionTailExit): SAVE_REGISTERS ALIGN_STACK_16B - LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax,%rax je LOCAL_LABEL(tmp4) @@ -234,12 +224,12 @@ ASM_SYMBOL(__xray_ArgLoggerEntry): ALIGN_STACK_16B // Again, these function pointer loads must be atomic; MOV is fine. - LOAD_HANDLER_ADDR _ZN6__xray13XRayArgLoggerE + movq ASM_SYMBOL(_ZN6__xray13XRayArgLoggerE)(%rip), %rax testq %rax, %rax jne LOCAL_LABEL(arg1entryLog) // If [arg1 logging handler] not set, defer to no-arg logging. - LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax, %rax je LOCAL_LABEL(arg1entryFail) @@ -278,7 +268,7 @@ ASM_SYMBOL(__xray_CustomEvent): // We take two arguments to this trampoline, which should be in rdi and rsi // already. - LOAD_HANDLER_ADDR _ZN6__xray22XRayPatchedCustomEventE + movq ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)(%rip), %rax testq %rax,%rax je LOCAL_LABEL(customEventCleanup) @@ -303,7 +293,7 @@ ASM_SYMBOL(__xray_TypedEvent): // We pass three arguments to this trampoline, which should be in rdi, rsi // and rdx without our intervention. 
- LOAD_HANDLER_ADDR _ZN6__xray21XRayPatchedTypedEventE + movq ASM_SYMBOL(_ZN6__xray21XRayPatchedTypedEventE)(%rip), %rax testq %rax,%rax je LOCAL_LABEL(typedEventCleanup) diff --git a/compiler-rt/lib/xray/xray_x86_64.cpp b/compiler-rt/lib/xray/xray_x86_64.cpp index 663a51b2686614..b9666a40861d48 100644 --- a/compiler-rt/lib/xray/xray_x86_64.cpp +++ b/compiler-rt/lib/xray/xray_x86_64.cpp @@ -170,8 +170,7 @@ bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, } bool patchFunctionExit(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled, - void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -193,11 +192,11 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, // Prerequisite is to compute the relative offset fo the // __xray_FunctionExit function's address. const uint64_t Address = Sled.address(); - int64_t TrampolineOffset = reinterpret_cast(Trampoline) - + int64_t TrampolineOffset = reinterpret_cast(__xray_FunctionExit) - (static_cast(Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Exit trampoline (%p) too far from sled (%p)\n", - reinterpret_cast(Trampoline), + reinterpret_cast(__xray_FunctionExit), reinterpret_cast(Address)); return false; } @@ -218,16 +217,16 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, } bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled, - void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the tail call sled with a similar // sequence as the entry sled, but calls the tail exit sled instead. 
const uint64_t Address = Sled.address(); - int64_t TrampolineOffset = reinterpret_cast(Trampoline) - - (static_cast(Address) + 11); + int64_t TrampolineOffset = + reinterpret_cast(__xray_FunctionTailExit) - + (static_cast(Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n", - reinterpret_cast(Trampoline), + reinterpret_cast(__xray_FunctionTailExit), reinterpret_cast(Address)); return false; } @@ -248,8 +247,7 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, } bool patchCustomEvent(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled, - void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -277,8 +275,7 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, } bool patchTypedEvent(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled, - void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: diff --git a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp deleted file mode 100644 index 31c615bd1f81bf..00000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// Testing shared library support in basic logging mode. 
- -// RUN: split-file %s %t -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o - -// RUN: XRAY_OPTIONS="patch_premain=false,xray_mode=xray-basic,xray_logfile_base=basic-mode-dso-,verbosity=1" XRAY_BASIC_OPTIONS="func_duration_threshold_us=0" %run %t/main.o 2>&1 | FileCheck %s -// RUN: %llvm_xray account --format=csv --sort=funcid "`ls basic-mode-dso-* | head -1`" | FileCheck --check-prefix=ACCOUNT %s -// RUN: rm basic-mode-dso-* - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include -#include - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); - sleep(1); -} - -extern void instrumented_in_dso(); - -int main() { - // Explicit patching to ensure the DSO has been loaded - __xray_patch(); - instrumented_in_executable(); - // CHECK: instrumented_in_executable called - instrumented_in_dso(); - // CHECK-NEXT: instrumented_in_dso called -} - -//--- testlib.cpp - -#include -#include - -[[clang::xray_always_instrument]] void instrumented_in_dso() { - printf("instrumented_in_dso called\n"); -} - -// ACCOUNT: funcid,count,min,median,90%ile,99%ile,max,sum,debug,function -// ACCOUNT-NEXT: 1,1,{{.*}} -// ACCOUNT-NEXT: 16777217,1,{{.*}} diff --git a/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp b/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp deleted file mode 100644 index 92f3c29e970d42..00000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp +++ /dev/null @@ -1,14 +0,0 @@ -// Test that the DSO-local runtime library has been linked if -fxray-shared is passed. 
-// -// RUN: %clangxx -fxray-instrument -fxray-shared %s -shared -o %t.so -// RUN: llvm-nm %t.so | FileCheck %s --check-prefix ENABLED - -// RUN: %clangxx -fxray-instrument %s -shared -o %t.so -// RUN: llvm-nm %t.so | FileCheck %s --check-prefix DISABLED -// -// REQUIRES: target=x86_64{{.*}} - -[[clang::xray_always_instrument]] int always_instrumented() { return 42; } - -// ENABLED: __start_xray_instr_map -// DISABLED-NOT: __start_xray_instr_map diff --git a/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp b/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp deleted file mode 100644 index 9db411d5ff1c6e..00000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// Check that we can patch and un-patch DSOs loaded with dlopen. -// - -// RUN: split-file %s %t -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so -// RUN: %clangxx_xray -g -fPIC -rdynamic -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp -o %t/main.o -// -// RUN: XRAY_OPTIONS="patch_premain=true" %run %t/main.o %t/testlib.so 2>&1 | FileCheck %s - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include -#include - -void test_handler(int32_t fid, XRayEntryType type) { - printf("called: %d, type=%d\n", fid, static_cast(type)); -} - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); -} - -typedef void (*dso_func_type)(); - -int main(int argc, char **argv) { - if (argc < 2) { - printf("Shared library argument missing\n"); - // CHECK-NOT: Shared library argument missing - return 1; - } - - const char *dso_path = argv[1]; - - void *dso_handle = dlopen(dso_path, RTLD_LAZY); - if (!dso_handle) { - printf("Failed to load shared library\n"); - char *error = dlerror(); - if (error) { - fprintf(stderr, "%s\n", error); - return 1; - } - return 1; - } - - dso_func_type instrumented_in_dso = - 
(dso_func_type)dlsym(dso_handle, "_Z19instrumented_in_dsov"); - if (!instrumented_in_dso) { - printf("Failed to find symbol\n"); - char *error = dlerror(); - if (error) { - fprintf(stderr, "%s\n", error); - return 1; - } - return 1; - } - - __xray_set_handler(test_handler); - - instrumented_in_executable(); - // CHECK: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{.*}}, type=1 - instrumented_in_dso(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_dso called - // CHECK-NEXT: called: {{.*}}, type=1 - - auto status = __xray_unpatch(); - printf("unpatching status: %d\n", static_cast(status)); - // CHECK-NEXT: unpatching status: 1 - - instrumented_in_executable(); - // CHECK-NEXT: instrumented_in_executable called - instrumented_in_dso(); - // CHECK-NEXT: instrumented_in_dso called - - status = __xray_patch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 - - instrumented_in_executable(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{.*}}, type=1 - instrumented_in_dso(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_dso called - // CHECK-NEXT: called: {{.*}}, type=1 - - dlclose(dso_handle); - - status = __xray_unpatch(); - printf("unpatching status: %d\n", static_cast(status)); - // CHECK-NEXT: unpatching status: 1 -} - -//--- testlib.cpp - -#include - -[[clang::xray_always_instrument]] void instrumented_in_dso() { - printf("instrumented_in_dso called\n"); -} diff --git a/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp b/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp deleted file mode 100644 index 89da2764c35cee..00000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp +++ /dev/null @@ -1,197 +0,0 @@ -// Check that loading libraries with different modes (RTLD_LOCAL/RTLD_GLOBAL) -// and dependencies on other 
DSOs work correctly. -// - -// RUN: split-file %s %t -// -// Build shared libs with dependencies b->c and e->f -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testliba.cpp -o %t/testliba.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibc.cpp -o %t/testlibc.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibb.cpp %t/testlibc.so -o %t/testlibb.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibd.cpp -o %t/testlibd.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibf.cpp -o %t/testlibf.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibe.cpp %t/testlibf.so -o %t/testlibe.so -// -// Executable links with a and b explicitly and loads d and e at runtime. -// RUN: %clangxx_xray -g -fPIC -rdynamic -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testliba.so %t/testlibb.so -o %t/main.o -// -// RUN: XRAY_OPTIONS="patch_premain=true" %run %t/main.o %t/testlibd.so %t/testlibe.so 2>&1 | FileCheck %s - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include -#include - -[[clang::xray_never_instrument]] void test_handler(int32_t fid, - XRayEntryType type) { - printf("called: %d, object=%d, fn=%d, type=%d\n", fid, (fid >> 24) & 0xFF, - fid & 0x00FFFFFF, static_cast(type)); -} - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); -} - -typedef void (*dso_func_type)(); - -[[clang::xray_never_instrument]] void *load_dso(const char *path, int mode) { - void *dso_handle = dlopen(path, mode); - if (!dso_handle) { - printf("failed to load shared library\n"); - char *error = dlerror(); - if (error) { - fprintf(stderr, "%s\n", error); - } - return nullptr; - } - return dso_handle; -} - 
-[[clang::xray_never_instrument]] void find_and_call(void *dso_handle, - const char *fn) { - dso_func_type dso_fn = (dso_func_type)dlsym(dso_handle, fn); - if (!dso_fn) { - printf("failed to find symbol\n"); - char *error = dlerror(); - if (error) { - fprintf(stderr, "%s\n", error); - } - return; - } - dso_fn(); -} - -extern void a(); -extern void b(); - -int main(int argc, char **argv) { - - if (argc < 3) { - printf("Shared library arguments missing\n"); - // CHECK-NOT: Shared library arguments missing - return 1; - } - - const char *dso_path_d = argv[1]; - const char *dso_path_e = argv[2]; - - __xray_set_handler(test_handler); - - instrumented_in_executable(); - // CHECK: called: {{[0-9]+}}, object=0, fn={{[0-9]+}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{[0-9]+}}, object=0, fn={{[0-9]+}}, type=1 - - a(); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ1:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: a called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ1]], fn=1, type=1 - - // Make sure this object ID does not appear again - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ1]] - - b(); // b calls c - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ2:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: b called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ3:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: c called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ3]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ3]] - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ2]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ2]] - - // Now check explicit loading with RTLD_LOCAL - - void *dso_handle_d = load_dso(dso_path_d, RTLD_LAZY | RTLD_LOCAL); - void *dso_handle_e = load_dso(dso_path_e, RTLD_LAZY | RTLD_LOCAL); - // CHECK-NOT: failed to load shared library - - find_and_call(dso_handle_d, "_Z1dv"); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ4:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: d called - // CHECK-NEXT: called: 
{{[0-9]+}}, object=[[OBJ4]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ4]] - - find_and_call(dso_handle_e, "_Z1ev"); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ5:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: e called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ6:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: f called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ6]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ6]] - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ5]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ5]] - - // Unload DSOs - dlclose(dso_handle_d); - dlclose(dso_handle_e); - - // Repeat test with RTLD_GLOBAL - dso_handle_d = load_dso(dso_path_d, RTLD_LAZY | RTLD_GLOBAL); - dso_handle_e = load_dso(dso_path_e, RTLD_LAZY | RTLD_GLOBAL); - // CHECK-NOT: failed to load shared library - - find_and_call(dso_handle_d, "_Z1dv"); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ7:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: d called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ7]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ7]] - - find_and_call(dso_handle_e, "_Z1ev"); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ8:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: e called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ9:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: f called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ9]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ9]] - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ8]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ8]] - - auto status = __xray_unpatch(); - printf("unpatching status: %d\n", static_cast(status)); - // CHECK-NEXT: unpatching status: 1 - - dlclose(dso_handle_d); - dlclose(dso_handle_e); -} - -//--- libgenmacro.inc -#include -// Helper macros to quickly generate libraries containing a single function. 
-#define GENERATE_LIB(NAME) \ - [[clang::xray_always_instrument]] void NAME() { printf(#NAME " called\n"); } - -#define GENERATE_LIB_WITH_CALL(NAME, FN) \ - extern void FN(); \ - [[clang::xray_always_instrument]] void NAME() { \ - printf(#NAME " called\n"); \ - FN(); \ - } - -//--- testliba.cpp -#include "libgenmacro.inc" -GENERATE_LIB(a) - -//--- testlibb.cpp -#include "libgenmacro.inc" -GENERATE_LIB_WITH_CALL(b, c) - -//--- testlibc.cpp -#include "libgenmacro.inc" -GENERATE_LIB(c) - -//--- testlibd.cpp -#include "libgenmacro.inc" -GENERATE_LIB(d) - -//--- testlibe.cpp -#include "libgenmacro.inc" -GENERATE_LIB_WITH_CALL(e, f) - -//--- testlibf.cpp -#include "libgenmacro.inc" -GENERATE_LIB(f) diff --git a/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp deleted file mode 100644 index 0708d0383439d0..00000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Checking that DSOs are automatically patched upon load, if patch_premain is passed. 
- -// RUN: split-file %s %t -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o - -// RUN: XRAY_OPTIONS="patch_premain=true,verbosity=1" %run %t/main.o 2>&1 | FileCheck %s - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include - -void test_handler(int32_t fid, XRayEntryType type) { - printf("called: %d, type=%d\n", fid, static_cast(type)); -} - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); -} - -extern void instrumented_in_dso(); - -int main() { - __xray_set_handler(test_handler); - instrumented_in_executable(); - // CHECK: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{.*}}, type=1 - instrumented_in_dso(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_dso called - // CHECK-NEXT: called: {{.*}}, type=1 -} - -//--- testlib.cpp - -#include - -[[clang::xray_always_instrument]] void instrumented_in_dso() { - printf("instrumented_in_dso called\n"); -} diff --git a/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp deleted file mode 100644 index d3e992dd497725..00000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Check that we can patch and un-patch on demand, and that logging gets invoked -// appropriately. 
-// - -// RUN: split-file %s %t -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o - -// RUN: XRAY_OPTIONS="patch_premain=false" %run %t/main.o 2>&1 | FileCheck %s - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include - -bool called = false; - -void test_handler(int32_t fid, XRayEntryType type) { - printf("called: %d, type=%d\n", fid, static_cast(type)); - called = true; -} - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); -} - -extern void instrumented_in_dso(); - -int main() { - __xray_set_handler(test_handler); - instrumented_in_executable(); - // CHECK: instrumented_in_executable called - instrumented_in_dso(); - // CHECK: instrumented_in_dso called - auto status = __xray_patch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 - instrumented_in_executable(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{.*}}, type=1 - instrumented_in_dso(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_dso called - // CHECK-NEXT: called: {{.*}}, type=1 - status = __xray_unpatch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 - instrumented_in_executable(); - // CHECK-NEXT: instrumented_in_executable called - instrumented_in_dso(); - // CHECK-NEXT: instrumented_in_dso called - status = __xray_patch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 - __xray_remove_handler(); - instrumented_in_executable(); - // CHECK-NEXT: instrumented_in_executable called - instrumented_in_dso(); - // CHECK-NEXT: instrumented_in_dso called - status = 
__xray_unpatch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 -} - -//--- testlib.cpp - -#include - -[[clang::xray_always_instrument]] void instrumented_in_dso() { - printf("instrumented_in_dso called\n"); -} From 14705a912f6296700cef4d2aa7eb100f71dfbd0a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 11 Oct 2024 16:16:12 +0400 Subject: [PATCH 158/177] CodeGen: Remove redundant REQUIRES registered-target from tests (#111982) These are already in target specific test directories. --- llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir | 1 - llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll | 1 - llvm/test/CodeGen/X86/tls-align.ll | 1 - 3 files changed, 3 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir b/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir index c1ddc9c14d814b..51e9ed6fef2d3a 100644 --- a/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir +++ b/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir @@ -1,7 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64-unknown-linux -run-pass=twoaddressinstruction -verify-machineinstrs %s -o - | FileCheck %s # RUN: llc -mtriple=aarch64-unknown-linux --passes=two-address-instruction -verify-each %s -o - | FileCheck %s -# REQUIRES: aarch64-registered-target # Verify that the register class is correctly constrained after the twoaddress replacement --- diff --git a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll index d0fd6685df3d73..cca70005b4cdc1 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature ; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s -; REQUIRES: amdgpu-registered-target target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" target triple = "amdgcn-amd-amdhsa" diff --git a/llvm/test/CodeGen/X86/tls-align.ll b/llvm/test/CodeGen/X86/tls-align.ll index e996c00dbf1d4a..94f9b9045cf24c 100644 --- a/llvm/test/CodeGen/X86/tls-align.ll +++ b/llvm/test/CodeGen/X86/tls-align.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86-registered-target ; RUN: opt -passes=instcombine -S < %s | FileCheck %s %class.Arr = type <{ [160 x %class.Derived], i32, [4 x i8] }> From 900ea21ffb38ba5b783b20f394c43c6c89d58086 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 11 Oct 2024 05:25:12 -0700 Subject: [PATCH 159/177] [NFC][CodingStandard] Add additional example for if-else brace rule (#111733) Add example to document that single statement `else` needs a brace if the associated `if` needs a brace. --- llvm/docs/CodingStandards.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst index 63df5af2523db6..87bbb3d127ad51 100644 --- a/llvm/docs/CodingStandards.rst +++ b/llvm/docs/CodingStandards.rst @@ -1713,6 +1713,14 @@ would help to avoid running into a "dangling else" situation. handleOtherDecl(D); } + // Use braces for the `else` block to keep it uniform with the `if` block. + if (isa(D)) { + verifyFunctionDecl(D); + handleFunctionDecl(D); + } else { + handleOtherDecl(D); + } + // This should also omit braces. The `for` loop contains only a single // statement, so it shouldn't have braces. The `if` also only contains a // single simple statement (the `for` loop), so it also should omit braces. 
From fa789dffb1e12c2aece0187aeacc48dfb1768340 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 11 Oct 2024 05:26:03 -0700 Subject: [PATCH 160/177] [NFC] Rename `Intrinsic::getDeclaration` to `getOrInsertDeclaration` (#111752) Rename the function to reflect its correct behavior and to be consistent with `Module::getOrInsertFunction`. This is also in preparation of adding a new `Intrinsic::getDeclaration` that will have behavior similar to `Module::getFunction` (i.e, just lookup, no creation). --- clang/lib/CodeGen/CGBuiltin.cpp | 11 +- clang/lib/CodeGen/CGDecl.cpp | 8 +- clang/lib/CodeGen/CGException.cpp | 4 +- clang/lib/CodeGen/CodeGenFunction.cpp | 4 +- clang/lib/CodeGen/CodeGenModule.cpp | 4 +- clang/lib/CodeGen/Targets/SystemZ.cpp | 4 +- llvm/examples/BrainF/BrainF.cpp | 4 +- llvm/include/llvm-c/Core.h | 4 +- llvm/include/llvm/IR/IntrinsicInst.h | 6 +- llvm/include/llvm/IR/Intrinsics.h | 9 +- llvm/include/llvm/IR/MatrixBuilder.h | 8 +- llvm/lib/AsmParser/LLParser.cpp | 2 +- llvm/lib/CodeGen/ExpandLargeFpConvert.cpp | 2 +- llvm/lib/CodeGen/ExpandMemCmp.cpp | 2 +- llvm/lib/CodeGen/ExpandVectorPredication.cpp | 14 +- llvm/lib/CodeGen/HardwareLoops.cpp | 12 +- llvm/lib/CodeGen/IntrinsicLowering.cpp | 2 +- llvm/lib/CodeGen/SafeStack.cpp | 3 +- llvm/lib/CodeGen/SjLjEHPrepare.cpp | 22 +- llvm/lib/CodeGen/StackProtector.cpp | 5 +- llvm/lib/CodeGen/WasmEHPrepare.cpp | 15 +- llvm/lib/IR/AutoUpgrade.cpp | 318 +++++++++--------- llvm/lib/IR/Core.cpp | 2 +- llvm/lib/IR/DIBuilder.cpp | 8 +- llvm/lib/IR/DebugProgramInstruction.cpp | 8 +- llvm/lib/IR/IRBuilder.cpp | 96 +++--- llvm/lib/IR/IntrinsicInst.cpp | 29 +- llvm/lib/IR/Intrinsics.cpp | 5 +- llvm/lib/IR/Module.cpp | 9 +- llvm/lib/IR/VectorBuilder.cpp | 4 +- .../Target/AArch64/AArch64ISelLowering.cpp | 29 +- .../Target/AArch64/AArch64StackTagging.cpp | 18 +- .../AArch64/AArch64TargetTransformInfo.cpp | 2 +- llvm/lib/Target/AArch64/SMEABIPass.cpp | 14 +- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 24 +- 
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 15 +- .../AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp | 2 +- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 17 +- .../AMDGPU/AMDGPUInstructionSelector.cpp | 4 +- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 13 +- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 8 +- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 14 +- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 11 +- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 4 +- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 4 +- .../Target/AMDGPU/SIAnnotateControlFlow.cpp | 16 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 25 +- llvm/lib/Target/ARM/ARMParallelDSP.cpp | 13 +- llvm/lib/Target/ARM/MVETailPredication.cpp | 2 +- .../Target/BPF/BPFAbstractMemberAccess.cpp | 2 +- llvm/lib/Target/BPF/BPFAdjustOpt.cpp | 2 +- .../Target/BPF/BPFPreserveStaticOffset.cpp | 2 +- llvm/lib/Target/DirectX/DXILOpLowering.cpp | 4 +- llvm/lib/Target/Hexagon/HexagonGenExtract.cpp | 2 +- .../Target/Hexagon/HexagonISelLowering.cpp | 4 +- .../Hexagon/HexagonLoopIdiomRecognition.cpp | 3 +- .../Target/Hexagon/HexagonVectorCombine.cpp | 11 +- .../LoongArch/LoongArchISelLowering.cpp | 4 +- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 2 +- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 3 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 8 +- .../Target/PowerPC/PPCLowerMASSVEntries.cpp | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 30 +- .../Target/SPIRV/SPIRVPrepareFunctions.cpp | 8 +- llvm/lib/Target/SystemZ/SystemZTDC.cpp | 4 +- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 2 +- .../WebAssemblyLowerRefTypesIntPtrConv.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +- .../Target/X86/X86InstCombineIntrinsic.cpp | 6 +- llvm/lib/Target/X86/X86PartialReduction.cpp | 2 +- llvm/lib/Target/X86/X86WinEHState.cpp | 16 +- .../Target/XCore/XCoreLowerThreadLocal.cpp | 4 +- .../AggressiveInstCombine.cpp | 11 +- llvm/lib/Transforms/Coroutines/Coroutines.cpp | 5 +- llvm/lib/Transforms/IPO/CrossDSOCFI.cpp | 3 +- 
.../lib/Transforms/IPO/SampleProfileProbe.cpp | 2 +- .../lib/Transforms/IPO/WholeProgramDevirt.cpp | 12 +- .../InstCombine/InstCombineAddSub.cpp | 7 +- .../InstCombine/InstCombineAndOrXor.cpp | 14 +- .../InstCombine/InstCombineCalls.cpp | 29 +- .../InstCombine/InstCombineCasts.cpp | 15 +- .../InstCombine/InstCombineCompares.cpp | 18 +- .../InstCombine/InstCombineSelect.cpp | 19 +- .../InstCombineSimplifyDemanded.cpp | 2 +- .../InstCombine/InstCombineVectorOps.cpp | 8 +- .../InstCombine/InstructionCombining.cpp | 6 +- .../Instrumentation/AddressSanitizer.cpp | 4 +- .../Instrumentation/BoundsChecking.cpp | 2 +- .../Instrumentation/HWAddressSanitizer.cpp | 4 +- llvm/lib/Transforms/Instrumentation/KCFI.cpp | 3 +- .../Instrumentation/MemorySanitizer.cpp | 6 +- .../Instrumentation/PGOInstrumentation.cpp | 23 +- .../Instrumentation/SanitizerCoverage.cpp | 2 +- .../Instrumentation/ThreadSanitizer.cpp | 7 +- .../ObjCARC/ARCRuntimeEntryPoints.h | 2 +- .../Transforms/Scalar/InferAddressSpaces.cpp | 14 +- .../Transforms/Scalar/LoopDataPrefetch.cpp | 2 +- llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 4 +- .../Transforms/Scalar/LoopIdiomRecognize.cpp | 4 +- .../Transforms/Scalar/LowerGuardIntrinsic.cpp | 2 +- .../Scalar/LowerMatrixIntrinsics.cpp | 2 +- .../Transforms/Scalar/MakeGuardsExplicit.cpp | 2 +- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 4 +- .../Scalar/RewriteStatepointsForGC.cpp | 4 +- llvm/lib/Transforms/Scalar/Scalarizer.cpp | 5 +- .../Transforms/Utils/AssumeBundleBuilder.cpp | 3 +- llvm/lib/Transforms/Utils/CloneFunction.cpp | 4 +- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 3 +- .../Utils/EntryExitInstrumenter.cpp | 2 +- llvm/lib/Transforms/Utils/InlineFunction.cpp | 7 +- llvm/lib/Transforms/Utils/IntegerDivision.cpp | 4 +- llvm/lib/Transforms/Utils/Local.cpp | 3 +- .../lib/Transforms/Utils/LowerGlobalDtors.cpp | 4 +- .../Transforms/Utils/MemoryTaggingSupport.cpp | 6 +- llvm/lib/Transforms/Utils/PredicateInfo.cpp | 4 +- .../Utils/PromoteMemoryToRegister.cpp | 
2 +- .../Utils/RelLookupTableConverter.cpp | 2 +- .../Utils/ScalarEvolutionExpander.cpp | 4 +- .../lib/Transforms/Utils/SimplifyLibCalls.cpp | 2 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +- .../llvm-reduce/deltas/ReduceOpcodes.cpp | 2 +- .../Analysis/AssumeBundleQueriesTest.cpp | 3 +- llvm/unittests/Analysis/MemorySSATest.cpp | 2 +- llvm/unittests/Analysis/ValueTrackingTest.cpp | 4 +- llvm/unittests/IR/BasicBlockTest.cpp | 8 +- llvm/unittests/IR/DebugInfoTest.cpp | 3 +- llvm/unittests/IR/IRBuilderTest.cpp | 5 +- llvm/unittests/IR/IntrinsicsTest.cpp | 2 +- llvm/unittests/IR/PatternMatch.cpp | 2 +- llvm/unittests/IR/VPIntrinsicTest.cpp | 4 +- .../Transforms/Vectorize/VPlanTest.cpp | 3 +- .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 6 +- .../LLVMIR/LLVMToLLVMIRTranslation.cpp | 5 +- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 7 +- polly/lib/CodeGen/IslExprBuilder.cpp | 12 +- polly/lib/CodeGen/PerfMonitor.cpp | 2 +- 137 files changed, 721 insertions(+), 642 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ff678ee04f9c2a..059c75fae284dd 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -13648,7 +13648,7 @@ Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID, Value *InfoKind = ConstantInt::get(Int64Ty, C->getSExtValue()); // Built the IR for the preserve_field_info intrinsic. 
- llvm::Function *FnGetFieldInfo = llvm::Intrinsic::getDeclaration( + llvm::Function *FnGetFieldInfo = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::bpf_preserve_field_info, {FieldAddr->getType()}); return Builder.CreateCall(FnGetFieldInfo, {FieldAddr, InfoKind}); @@ -13670,10 +13670,10 @@ Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID, llvm::Function *FnDecl; if (BuiltinID == BPF::BI__builtin_btf_type_id) - FnDecl = llvm::Intrinsic::getDeclaration( + FnDecl = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::bpf_btf_type_id, {}); else - FnDecl = llvm::Intrinsic::getDeclaration( + FnDecl = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::bpf_preserve_type_info, {}); CallInst *Fn = Builder.CreateCall(FnDecl, {SeqNumVal, FlagValue}); Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo); @@ -13708,7 +13708,7 @@ Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID, Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue()); Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++); - llvm::Function *IntrinsicFn = llvm::Intrinsic::getDeclaration( + llvm::Function *IntrinsicFn = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::bpf_preserve_enum_value, {}); CallInst *Fn = Builder.CreateCall(IntrinsicFn, {SeqNumVal, EnumStrVal, FlagValue}); @@ -18895,7 +18895,8 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: { } case Builtin::BI__builtin_hlsl_wave_is_first_lane: { Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveIsFirstLaneIntrinsic(); - return EmitRuntimeCall(Intrinsic::getDeclaration(&CGM.getModule(), ID)); + return EmitRuntimeCall( + Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); } case Builtin::BI__builtin_hlsl_elementwise_sign: { auto *Arg0 = E->getArg(0); diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 563f728e29d781..30af9268b30e2e 100644 --- 
a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -2509,8 +2509,8 @@ void CodeGenFunction::pushRegularPartialArrayCleanup(llvm::Value *arrayBegin, llvm::Function *CodeGenModule::getLLVMLifetimeStartFn() { if (LifetimeStartFn) return LifetimeStartFn; - LifetimeStartFn = llvm::Intrinsic::getDeclaration(&getModule(), - llvm::Intrinsic::lifetime_start, AllocaInt8PtrTy); + LifetimeStartFn = llvm::Intrinsic::getOrInsertDeclaration( + &getModule(), llvm::Intrinsic::lifetime_start, AllocaInt8PtrTy); return LifetimeStartFn; } @@ -2518,8 +2518,8 @@ llvm::Function *CodeGenModule::getLLVMLifetimeStartFn() { llvm::Function *CodeGenModule::getLLVMLifetimeEndFn() { if (LifetimeEndFn) return LifetimeEndFn; - LifetimeEndFn = llvm::Intrinsic::getDeclaration(&getModule(), - llvm::Intrinsic::lifetime_end, AllocaInt8PtrTy); + LifetimeEndFn = llvm::Intrinsic::getOrInsertDeclaration( + &getModule(), llvm::Intrinsic::lifetime_end, AllocaInt8PtrTy); return LifetimeEndFn; } diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index bb2ed237ee9f35..44a45413dbc45a 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -1843,7 +1843,7 @@ Address CodeGenFunction::recoverAddrOfEscapedLocal(CodeGenFunction &ParentCGF, std::make_pair(ParentAlloca, ParentCGF.EscapedLocals.size())); int FrameEscapeIdx = InsertPair.first->second; // call ptr @llvm.localrecover(ptr @parentFn, ptr %fp, i32 N) - llvm::Function *FrameRecoverFn = llvm::Intrinsic::getDeclaration( + llvm::Function *FrameRecoverFn = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::localrecover); RecoverCall = Builder.CreateCall( FrameRecoverFn, {ParentCGF.CurFn, ParentFP, @@ -1942,7 +1942,7 @@ void CodeGenFunction::EmitCapturedLocals(CodeGenFunction &ParentCGF, // %1 = call ptr @llvm.localrecover(@"?fin$0@0@main@@",..) 
// %2 = load ptr, ptr %1, align 8 // ==> %2 is the frame-pointer of outermost host function - llvm::Function *FrameRecoverFn = llvm::Intrinsic::getDeclaration( + llvm::Function *FrameRecoverFn = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::localrecover); ParentFP = Builder.CreateCall( FrameRecoverFn, {ParentCGF.CurFn, ParentFP, diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index e1fd9b72b8d7b2..f3023c7a20c405 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -463,7 +463,7 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { EscapeArgs.resize(EscapedLocals.size()); for (auto &Pair : EscapedLocals) EscapeArgs[Pair.second] = Pair.first; - llvm::Function *FrameEscapeFn = llvm::Intrinsic::getDeclaration( + llvm::Function *FrameEscapeFn = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::localescape); CGBuilderTy(*this, AllocaInsertPt).CreateCall(FrameEscapeFn, EscapeArgs); } @@ -3130,7 +3130,7 @@ void CodeGenFunction::emitAlignmentAssumptionCheck( llvm::Instruction *Assumption) { assert(isa_and_nonnull(Assumption) && cast(Assumption)->getCalledOperand() == - llvm::Intrinsic::getDeclaration( + llvm::Intrinsic::getOrInsertDeclaration( Builder.GetInsertBlock()->getParent()->getParent(), llvm::Intrinsic::assume) && "Assumption should be a call to llvm.assume()."); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 5ba098144a74e7..7a7dea4668ad09 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -6218,8 +6218,8 @@ void CodeGenModule::emitIFuncDefinition(GlobalDecl GD) { llvm::Function *CodeGenModule::getIntrinsic(unsigned IID, ArrayRef Tys) { - return llvm::Intrinsic::getDeclaration(&getModule(), (llvm::Intrinsic::ID)IID, - Tys); + return llvm::Intrinsic::getOrInsertDeclaration(&getModule(), + (llvm::Intrinsic::ID)IID, Tys); } 
static llvm::StringMapEntry & diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp index 56129622f48dbd..23c96fa5cf98cb 100644 --- a/clang/lib/CodeGen/Targets/SystemZ.cpp +++ b/clang/lib/CodeGen/Targets/SystemZ.cpp @@ -110,8 +110,8 @@ class SystemZTargetCodeGenInfo : public TargetCodeGenInfo { if (Ty->isFloatTy() || Ty->isDoubleTy() || Ty->isFP128Ty()) { llvm::Module &M = CGM.getModule(); auto &Ctx = M.getContext(); - llvm::Function *TDCFunc = - llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::s390_tdc, Ty); + llvm::Function *TDCFunc = llvm::Intrinsic::getOrInsertDeclaration( + &M, llvm::Intrinsic::s390_tdc, Ty); unsigned TDCBits = 0; switch (BuiltinID) { case Builtin::BI__builtin_isnan: diff --git a/llvm/examples/BrainF/BrainF.cpp b/llvm/examples/BrainF/BrainF.cpp index ac01961735e137..e62cc7bd591a3f 100644 --- a/llvm/examples/BrainF/BrainF.cpp +++ b/llvm/examples/BrainF/BrainF.cpp @@ -67,8 +67,8 @@ void BrainF::header(LLVMContext& C) { //declare void @llvm.memset.p0i8.i32(i8 *, i8, i32, i1) Type *Tys[] = {PointerType::getUnqual(C), Type::getInt32Ty(C)}; - Function *memset_func = Intrinsic::getDeclaration(module, Intrinsic::memset, - Tys); + Function *memset_func = + Intrinsic::getOrInsertDeclaration(module, Intrinsic::memset, Tys); //declare i32 @getchar() getchar_func = diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 28dc270ca368d2..55649d89a6b8f4 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -2807,10 +2807,10 @@ unsigned LLVMLookupIntrinsicID(const char *Name, size_t NameLen); unsigned LLVMGetIntrinsicID(LLVMValueRef Fn); /** - * Create or insert the declaration of an intrinsic. For overloaded intrinsics, + * Get or insert the declaration of an intrinsic. For overloaded intrinsics, * parameter types must be provided to uniquely identify an overload. 
* - * @see llvm::Intrinsic::getDeclaration() + * @see llvm::Intrinsic::getOrInsertDeclaration() */ LLVMValueRef LLVMGetIntrinsicDeclaration(LLVMModuleRef Mod, unsigned ID, diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 4458126ffa759d..920eed01374c83 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -568,9 +568,9 @@ class VPIntrinsic : public IntrinsicInst { /// \brief Declares a llvm.vp.* intrinsic in \p M that matches the parameters /// \p Params. Additionally, the load and gather intrinsics require /// \p ReturnType to be specified. - static Function *getDeclarationForParams(Module *M, Intrinsic::ID, - Type *ReturnType, - ArrayRef Params); + static Function *getOrInsertDeclarationForParams(Module *M, Intrinsic::ID, + Type *ReturnType, + ArrayRef Params); static std::optional getMaskParamPos(Intrinsic::ID IntrinsicID); static std::optional getVectorLengthParamPos( diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index b251036247c5c0..8c37925732a83a 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -87,14 +87,15 @@ namespace Intrinsic { /// Return the attributes for an intrinsic. AttributeList getAttributes(LLVMContext &C, ID id); - /// Create or insert an LLVM Function declaration for an intrinsic, and return - /// it. + /// Look up the Function declaration of the intrinsic \p id in the Module + /// \p M. If it does not exist, add a declaration and return it. Otherwise, + /// return the existing declaration. /// - /// The Tys parameter is for intrinsics with overloaded types (e.g., those + /// The \p Tys parameter is for intrinsics with overloaded types (e.g., those /// using iAny, fAny, vAny, or iPTRAny). For a declaration of an overloaded /// intrinsic, Tys must provide exactly one type for each overloaded type in /// the intrinsic. 
- Function *getDeclaration(Module *M, ID id, ArrayRef Tys = {}); + Function *getOrInsertDeclaration(Module *M, ID id, ArrayRef Tys = {}); /// Looks up Name in NameTable via binary search. NameTable must be sorted /// and all entries must start with "llvm.". If NameTable contains an exact diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h index dbf2cfb7c5e966..3a04ca87f2b558 100644 --- a/llvm/include/llvm/IR/MatrixBuilder.h +++ b/llvm/include/llvm/IR/MatrixBuilder.h @@ -72,7 +72,7 @@ class MatrixBuilder { B.getInt32(Columns)}; Type *OverloadedTypes[] = {RetType, Stride->getType()}; - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( getModule(), Intrinsic::matrix_column_major_load, OverloadedTypes); CallInst *Call = B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); @@ -95,7 +95,7 @@ class MatrixBuilder { B.getInt32(Rows), B.getInt32(Columns)}; Type *OverloadedTypes[] = {Matrix->getType(), Stride->getType()}; - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( getModule(), Intrinsic::matrix_column_major_store, OverloadedTypes); CallInst *Call = B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); @@ -115,7 +115,7 @@ class MatrixBuilder { Type *OverloadedTypes[] = {ReturnType}; Value *Ops[] = {Matrix, B.getInt32(Rows), B.getInt32(Columns)}; - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( getModule(), Intrinsic::matrix_transpose, OverloadedTypes); return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); @@ -136,7 +136,7 @@ class MatrixBuilder { B.getInt32(RHSColumns)}; Type *OverloadedTypes[] = {ReturnType, LHSType, RHSType}; - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( getModule(), Intrinsic::matrix_multiply, OverloadedTypes); return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); } diff --git 
a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index c3b4a8235ce637..5b9bddeb7cfe82 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -360,7 +360,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { OverloadTys)) return error(Info.second, "invalid intrinsic signature"); - U.set(Intrinsic::getDeclaration(M, IID, OverloadTys)); + U.set(Intrinsic::getOrInsertDeclaration(M, IID, OverloadTys)); } Info.first->eraseFromParent(); diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp index 11f123aa5bed85..0a3d0cf8ec9300 100644 --- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp +++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp @@ -356,7 +356,7 @@ static void expandIToFP(Instruction *IToFP) { Entry->getTerminator()->eraseFromParent(); Function *CTLZ = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, IntTy); + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz, IntTy); ConstantInt *True = Builder.getTrue(); // entry: diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index 04222d5b4afd4c..6d626de0b4e635 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -355,7 +355,7 @@ MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType, // Swap bytes if required. 
if (BSwapSizeType) { - Function *Bswap = Intrinsic::getDeclaration( + Function *Bswap = Intrinsic::getOrInsertDeclaration( CI->getModule(), Intrinsic::bswap, BSwapSizeType); Lhs = Builder.CreateCall(Bswap, Lhs); Rhs = Builder.CreateCall(Bswap, Rhs); diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index ffe879ff049648..32ba3e91822ddb 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -237,7 +237,7 @@ Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder, if (ElemCount.isScalable()) { auto *M = Builder.GetInsertBlock()->getModule(); Type *BoolVecTy = VectorType::get(Builder.getInt1Ty(), ElemCount); - Function *ActiveMaskFunc = Intrinsic::getDeclaration( + Function *ActiveMaskFunc = Intrinsic::getOrInsertDeclaration( M, Intrinsic::get_active_lane_mask, {BoolVecTy, EVLParam->getType()}); // `get_active_lane_mask` performs an implicit less-than comparison. Value *ConstZero = Builder.getInt32(0); @@ -299,7 +299,7 @@ Value *CachingVPExpander::expandPredicationToIntCall( case Intrinsic::umin: { Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp = Builder.CreateCall(Fn, {Op0, Op1}, VPI.getName()); replaceOperation(*NewOp, VPI); @@ -308,7 +308,7 @@ Value *CachingVPExpander::expandPredicationToIntCall( case Intrinsic::bswap: case Intrinsic::bitreverse: { Value *Op = VPI.getOperand(0); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp = Builder.CreateCall(Fn, {Op}, VPI.getName()); replaceOperation(*NewOp, VPI); @@ -327,7 +327,7 @@ Value *CachingVPExpander::expandPredicationToFPCall( case Intrinsic::fabs: case Intrinsic::sqrt: { Value *Op0 = 
VPI.getOperand(0); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp = Builder.CreateCall(Fn, {Op0}, VPI.getName()); replaceOperation(*NewOp, VPI); @@ -337,7 +337,7 @@ Value *CachingVPExpander::expandPredicationToFPCall( case Intrinsic::minnum: { Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp = Builder.CreateCall(Fn, {Op0, Op1}, VPI.getName()); replaceOperation(*NewOp, VPI); @@ -350,7 +350,7 @@ Value *CachingVPExpander::expandPredicationToFPCall( Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); Value *Op2 = VPI.getOperand(2); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp; if (Intrinsic::isConstrainedFPIntrinsic(UnpredicatedIntrinsicID)) @@ -594,7 +594,7 @@ bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { // TODO add caching auto *M = VPI.getModule(); Function *VScaleFunc = - Intrinsic::getDeclaration(M, Intrinsic::vscale, Int32Ty); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::vscale, Int32Ty); IRBuilder<> Builder(VPI.getParent(), VPI.getIterator()); Value *FactorConst = Builder.getInt32(StaticElemCount.getKnownMinValue()); Value *VScale = Builder.CreateCall(VScaleFunc, {}, "vscale"); diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp index 9205eabcf5684e..c8a63304a3b63b 100644 --- a/llvm/lib/CodeGen/HardwareLoops.cpp +++ b/llvm/lib/CodeGen/HardwareLoops.cpp @@ -512,7 +512,7 @@ Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) { : Intrinsic::test_set_loop_iterations) : (UsePhi ? 
Intrinsic::start_loop_iterations : Intrinsic::set_loop_iterations); - Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty); + Function *LoopIter = Intrinsic::getOrInsertDeclaration(M, ID, Ty); Value *LoopSetup = Builder.CreateCall(LoopIter, LoopCountInit); // Use the return value of the intrinsic to control the entry of the loop. @@ -541,9 +541,8 @@ void HardwareLoop::InsertLoopDec() { Attribute::StrictFP)) CondBuilder.setIsFPConstrained(true); - Function *DecFunc = - Intrinsic::getDeclaration(M, Intrinsic::loop_decrement, - LoopDecrement->getType()); + Function *DecFunc = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::loop_decrement, LoopDecrement->getType()); Value *Ops[] = { LoopDecrement }; Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops); Value *OldCond = ExitBranch->getCondition(); @@ -566,9 +565,8 @@ Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) { Attribute::StrictFP)) CondBuilder.setIsFPConstrained(true); - Function *DecFunc = - Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg, - { EltsRem->getType() }); + Function *DecFunc = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::loop_decrement_reg, {EltsRem->getType()}); Value *Ops[] = { EltsRem, LoopDecrement }; Value *Call = CondBuilder.CreateCall(DecFunc, Ops); diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp index 256c081b46e262..f799a8cfc1ba7e 100644 --- a/llvm/lib/CodeGen/IntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp @@ -474,7 +474,7 @@ bool IntrinsicLowering::LowerToByteSwap(CallInst *CI) { // Okay, we can do this xform, do so now. 
Module *M = CI->getModule(); - Function *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Ty); + Function *Int = Intrinsic::getOrInsertDeclaration(M, Intrinsic::bswap, Ty); Value *Op = CI->getArgOperand(0); Op = CallInst::Create(Int, Op, CI->getName(), CI->getIterator()); diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index e41d1bfb0e530d..a50909af8bfcfb 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -368,7 +368,8 @@ Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) { if (!StackGuardVar) { TL.insertSSPDeclarations(*M); - return IRB.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackguard)); + return IRB.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard)); } return IRB.CreateLoad(StackPtrTy, StackGuardVar, "StackGuard"); diff --git a/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/llvm/lib/CodeGen/SjLjEHPrepare.cpp index 054f7d7215962e..c4ad9f0b2172fc 100644 --- a/llvm/lib/CodeGen/SjLjEHPrepare.cpp +++ b/llvm/lib/CodeGen/SjLjEHPrepare.cpp @@ -508,17 +508,19 @@ bool SjLjEHPrepareImpl::runOnFunction(Function &F) { PointerType *AllocaPtrTy = M.getDataLayout().getAllocaPtrType(M.getContext()); - FrameAddrFn = - Intrinsic::getDeclaration(&M, Intrinsic::frameaddress, {AllocaPtrTy}); - StackAddrFn = - Intrinsic::getDeclaration(&M, Intrinsic::stacksave, {AllocaPtrTy}); - StackRestoreFn = - Intrinsic::getDeclaration(&M, Intrinsic::stackrestore, {AllocaPtrTy}); + FrameAddrFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::frameaddress, + {AllocaPtrTy}); + StackAddrFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::stacksave, + {AllocaPtrTy}); + StackRestoreFn = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::stackrestore, {AllocaPtrTy}); BuiltinSetupDispatchFn = - Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setup_dispatch); - LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda); - CallSiteFn = Intrinsic::getDeclaration(&M, 
Intrinsic::eh_sjlj_callsite); - FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::eh_sjlj_setup_dispatch); + LSDAAddrFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::eh_sjlj_lsda); + CallSiteFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::eh_sjlj_callsite); + FuncCtxFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::eh_sjlj_functioncontext); bool Res = setupEntryBlockAndCallSites(F); return Res; diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 1f23838b2de0ca..a192161bbd9481 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -519,7 +519,8 @@ static Value *getStackGuard(const TargetLoweringBase *TLI, Module *M, if (SupportsSelectionDAGSP) *SupportsSelectionDAGSP = true; TLI->insertSSPDeclarations(*M); - return B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackguard)); + return B.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard)); } /// Insert code into the entry block that stores the stack guard @@ -540,7 +541,7 @@ static bool CreatePrologue(Function *F, Module *M, Instruction *CheckLoc, AI = B.CreateAlloca(PtrTy, nullptr, "StackGuardSlot"); Value *GuardSlot = getStackGuard(TLI, M, B, &SupportsSelectionDAGSP); - B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackprotector), + B.CreateCall(Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackprotector), {GuardSlot, AI}); return SupportsSelectionDAGSP; } diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp index 7514d49fb18a98..1701b0d04425d2 100644 --- a/llvm/lib/CodeGen/WasmEHPrepare.cpp +++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp @@ -196,7 +196,7 @@ bool WasmEHPrepareImpl::prepareThrows(Function &F) { bool Changed = false; // wasm.throw() intinsic, which will be lowered to wasm 'throw' instruction. 
- ThrowF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_throw); + ThrowF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_throw); // Insert an unreachable instruction after a call to @llvm.wasm.throw and // delete all following instructions within the BB, and delete all the dead // children of the BB as well. @@ -260,18 +260,21 @@ bool WasmEHPrepareImpl::prepareEHPads(Function &F) { 0, 2, "selector_gep"); // wasm.landingpad.index() intrinsic, which is to specify landingpad index - LPadIndexF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_landingpad_index); + LPadIndexF = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_landingpad_index); // wasm.lsda() intrinsic. Returns the address of LSDA table for the current // function. - LSDAF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_lsda); + LSDAF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_lsda); // wasm.get.exception() and wasm.get.ehselector() intrinsics. Calls to these // are generated in clang. - GetExnF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_get_exception); - GetSelectorF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_get_ehselector); + GetExnF = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_get_exception); + GetSelectorF = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_get_ehselector); // wasm.catch() will be lowered down to wasm 'catch' instruction in // instruction selection. 
- CatchF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_catch); + CatchF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_catch); // _Unwind_CallPersonality() wrapper function, which calls the personality CallPersonalityF = M.getOrInsertFunction("_Unwind_CallPersonality", diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 215bfc8c6cfe3e..477b77a6dd5335 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -65,7 +65,7 @@ static bool upgradePTESTIntrinsic(Function *F, Intrinsic::ID IID, // Yes, it's old, replace it with new version. rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -81,7 +81,7 @@ static bool upgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID, // Move this function aside and map down. rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -94,7 +94,7 @@ static bool upgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID, return false; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -104,7 +104,7 @@ static bool upgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID, return false; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -114,7 +114,7 @@ static bool upgradeX86BF16DPIntrinsic(Function *F, Intrinsic::ID IID, return false; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -502,8 +502,8 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name, return false; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::x86_rdtscp); + NewFn = 
Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::x86_rdtscp); return true; } @@ -609,14 +609,15 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name, if (ID != Intrinsic::not_intrinsic) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } return false; // No other 'x86.xop.*' } if (Name == "seh.recoverfp") { - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::eh_recoverfp); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::eh_recoverfp); return true; } @@ -630,15 +631,15 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, Function *&NewFn) { if (Name.starts_with("rbit")) { // '(arm|aarch64).rbit'. - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::bitreverse, - F->arg_begin()->getType()); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::bitreverse, F->arg_begin()->getType()); return true; } if (Name == "thread.pointer") { // '(arm|aarch64).thread.pointer'. - NewFn = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::thread_pointer); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::thread_pointer); return true; } @@ -663,7 +664,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, std::array Tys{ {F->getReturnType(), FixedVectorType::get(Type::getBFloatTy(Ctx), OperandWidth / 16)}}; - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); return true; } return false; // No other '(arm|aarch64).neon.bfdot.*'. 
@@ -688,7 +689,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, : (Intrinsic::ID)Intrinsic::aarch64_neon_bfmlalt) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } return false; // No other '(arm|aarch64).neon.bfm*.v16i8'. @@ -712,8 +713,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, .StartsWith("vqsubu.", Intrinsic::usub_sat) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, - F->arg_begin()->getType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + F->arg_begin()->getType()); return true; } @@ -733,10 +734,10 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, auto fArgs = F->getFunctionType()->params(); Type *Tys[] = {fArgs[0], fArgs[1]}; if (Groups[1].size() == 1) - NewFn = Intrinsic::getDeclaration(F->getParent(), - StoreInts[fArgs.size() - 3], Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), StoreInts[fArgs.size() - 3], Tys); else - NewFn = Intrinsic::getDeclaration( + NewFn = Intrinsic::getOrInsertDeclaration( F->getParent(), StoreLaneInts[fArgs.size() - 5], Tys); return true; } @@ -810,8 +811,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, .StartsWith("rbit", Intrinsic::bitreverse) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, - F->arg_begin()->getType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + F->arg_begin()->getType()); return true; } @@ -821,8 +822,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, return false; // Invalid IR. 
VectorType *Ty = dyn_cast(F->getReturnType()); if (Ty && Ty->getElementType()->isFloatingPointTy()) { - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::aarch64_neon_faddp, Ty); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::aarch64_neon_faddp, Ty); return true; } } @@ -840,7 +841,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, .Case("mlalt", Intrinsic::aarch64_sve_bfmlalt_lane_v2) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } return false; // No other 'aarch64.sve.bf*.lane'. @@ -861,8 +862,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, auto Args = F->getFunctionType()->params(); Type *Tys[] = {F->getReturnType(), Args[1]}; - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::aarch64_sve_faddqv, Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::aarch64_sve_faddqv, Tys); return true; } @@ -880,8 +881,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, Intrinsic::aarch64_sve_ld3_sret, Intrinsic::aarch64_sve_ld4_sret, }; - NewFn = Intrinsic::getDeclaration(F->getParent(), - LoadIDs[Name[0] - '2'], Ty); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + LoadIDs[Name[0] - '2'], Ty); return true; } return false; // No other 'aarch64.sve.ld*'. @@ -892,8 +893,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, if (Name.starts_with("get")) { // 'aarch64.sve.tuple.get*'. 
Type *Tys[] = {F->getReturnType(), F->arg_begin()->getType()}; - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::vector_extract, Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::vector_extract, Tys); return true; } @@ -901,8 +902,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, // 'aarch64.sve.tuple.set*'. auto Args = F->getFunctionType()->params(); Type *Tys[] = {Args[0], Args[2], Args[1]}; - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::vector_insert, Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::vector_insert, Tys); return true; } @@ -911,8 +912,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, // 'aarch64.sve.tuple.create*'. auto Args = F->getFunctionType()->params(); Type *Tys[] = {F->getReturnType(), Args[1]}; - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::vector_insert, Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::vector_insert, Tys); return true; } return false; // No other 'aarch64.sve.tuple.*'. 
@@ -1026,8 +1027,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (Name.consume_front("amdgcn.")) { if (Name == "alignbit") { // Target specific intrinsic became redundant - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::fshr, - {F->getReturnType()}); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::fshr, {F->getReturnType()}); return true; } @@ -1056,9 +1057,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (Name.starts_with("ldexp.")) { // Target specific intrinsic became redundant - NewFn = Intrinsic::getDeclaration( - F->getParent(), Intrinsic::ldexp, - {F->getReturnType(), F->getArg(1)->getType()}); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::ldexp, + {F->getReturnType(), F->getArg(1)->getType()}); return true; } break; // No other 'amdgcn.*' @@ -1074,15 +1075,16 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, - F->arg_begin()->getType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + F->arg_begin()->getType()); return true; } } if (F->arg_size() == 2 && Name == "coro.end") { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::coro_end); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::coro_end); return true; } @@ -1105,7 +1107,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, // converted to DbgVariableRecords later. if (Name == "addr" || (Name == "value" && F->arg_size() == 4)) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::dbg_value); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::dbg_value); return true; } break; // No other 'dbg.*'. 
@@ -1135,7 +1138,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, // Inserting overloads the inserted type. Tys.push_back(FT->getParamType(1)); rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); return true; } @@ -1171,8 +1174,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (ID != Intrinsic::not_intrinsic) { rename(F); auto Args = F->getFunctionType()->params(); - NewFn = - Intrinsic::getDeclaration(F->getParent(), ID, {Args[V2 ? 1 : 0]}); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + {Args[V2 ? 1 : 0]}); return true; } break; // No other 'expermental.vector.reduce.*'. @@ -1182,15 +1185,16 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (Name.consume_front("experimental.stepvector.")) { Intrinsic::ID ID = Intrinsic::stepvector; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, - F->getFunctionType()->getReturnType()); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), ID, F->getFunctionType()->getReturnType()); return true; } break; // No other 'e*'. 
case 'f': if (Name.starts_with("flt.rounds")) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::get_rounding); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::get_rounding); return true; } break; @@ -1200,8 +1204,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, auto Args = F->getFunctionType()->params(); Type* ObjectPtr[1] = {Args[0]}; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::launder_invariant_group, ObjectPtr); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::launder_invariant_group, ObjectPtr); return true; } break; @@ -1218,7 +1222,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, // Get the types of dest, src, and len ArrayRef ParamTypes = F->getFunctionType()->params().slice(0, 3); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, ParamTypes); + NewFn = + Intrinsic::getOrInsertDeclaration(F->getParent(), ID, ParamTypes); return true; } } @@ -1230,8 +1235,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, FT->getParamType(0), // Dest FT->getParamType(2) // len }; - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::memset, - ParamTypes); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::memset, ParamTypes); return true; } break; @@ -1247,8 +1252,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Case("popc.i", Intrinsic::ctpop) .Default(Intrinsic::not_intrinsic); if (IID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), IID, - {F->getReturnType()}); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID, + {F->getReturnType()}); return true; } } @@ -1316,8 +1321,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys, F->getParent())) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), 
Intrinsic::objectsize, - Tys); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::objectsize, Tys); return true; } } @@ -1326,7 +1331,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, case 'p': if (Name.starts_with("ptr.annotation.") && F->arg_size() == 4) { rename(F); - NewFn = Intrinsic::getDeclaration( + NewFn = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::ptr_annotation, {F->arg_begin()->getType(), F->getArg(1)->getType()}); return true; @@ -1345,7 +1350,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (ID != Intrinsic::not_intrinsic) { if (!F->getFunctionType()->getParamType(2)->isIntegerTy(32)) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } break; // No other applicable upgrades. @@ -1359,7 +1364,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (!F->getFunctionType()->getParamType(2)->isIntegerTy(32) || F->getFunctionType()->getReturnType()->isIntegerTy(64)) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } break; // No other applicable upgrades. @@ -1376,7 +1381,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (ID != Intrinsic::not_intrinsic) { if (F->getFunctionType()->getReturnType()->isIntegerTy(64)) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } break; // No other applicable upgrades. 
@@ -1395,7 +1400,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, case 'v': { if (Name == "var.annotation" && F->arg_size() == 4) { rename(F); - NewFn = Intrinsic::getDeclaration( + NewFn = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::var_annotation, {{F->arg_begin()->getType(), F->getArg(1)->getType()}}); return true; @@ -1413,8 +1418,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { rename(F); - NewFn = - Intrinsic::getDeclaration(F->getParent(), ID, F->getReturnType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + F->getReturnType()); return true; } @@ -1426,7 +1431,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } break; // No other 'wasm.dot.i8x16.i7x16.*'. @@ -1740,8 +1745,8 @@ static Value *upgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallBase &CI, if (!IndexForm) std::swap(Args[0], Args[1]); - Value *V = Builder.CreateCall(Intrinsic::getDeclaration(CI.getModule(), IID), - Args); + Value *V = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Builder.CreateBitCast(CI.getArgOperand(1), Ty); @@ -1753,7 +1758,7 @@ static Value *upgradeX86BinaryIntrinsics(IRBuilder<> &Builder, CallBase &CI, Type *Ty = CI.getType(); Value *Op0 = CI.getOperand(0); Value *Op1 = CI.getOperand(1); - Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty); + Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); Value *Res = Builder.CreateCall(Intrin, {Op0, Op1}); if (CI.arg_size() == 4) { // For masked intrinsics. 
@@ -1780,7 +1785,7 @@ static Value *upgradeX86Rotate(IRBuilder<> &Builder, CallBase &CI, } Intrinsic::ID IID = IsRotateRight ? Intrinsic::fshr : Intrinsic::fshl; - Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty); + Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); Value *Res = Builder.CreateCall(Intrin, {Src, Src, Amt}); if (CI.arg_size() == 4) { // For masked intrinsics. @@ -1850,7 +1855,7 @@ static Value *upgradeX86ConcatShift(IRBuilder<> &Builder, CallBase &CI, } Intrinsic::ID IID = IsShiftRight ? Intrinsic::fshr : Intrinsic::fshl; - Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty); + Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); Value *Res = Builder.CreateCall(Intrin, {Op0, Op1, Amt}); unsigned NumArgs = CI.arg_size(); @@ -1911,7 +1916,8 @@ static Value *upgradeMaskedLoad(IRBuilder<> &Builder, Value *Ptr, static Value *upgradeAbs(IRBuilder<> &Builder, CallBase &CI) { Type *Ty = CI.getType(); Value *Op0 = CI.getArgOperand(0); - Function *F = Intrinsic::getDeclaration(CI.getModule(), Intrinsic::abs, Ty); + Function *F = + Intrinsic::getOrInsertDeclaration(CI.getModule(), Intrinsic::abs, Ty); Value *Res = Builder.CreateCall(F, {Op0, Builder.getInt1(false)}); if (CI.arg_size() == 3) Res = emitX86Select(Builder, CI.getArgOperand(2), Res, CI.getArgOperand(1)); @@ -2004,7 +2010,7 @@ static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallBase &CI, // Replace a masked intrinsic with an older unmasked intrinsic. 
static Value *upgradeX86MaskedShift(IRBuilder<> &Builder, CallBase &CI, Intrinsic::ID IID) { - Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID); + Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID); Value *Rep = Builder.CreateCall(Intrin, { CI.getArgOperand(0), CI.getArgOperand(1) }); return emitX86Select(Builder, CI.getArgOperand(3), Rep, CI.getArgOperand(2)); @@ -2263,8 +2269,8 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, SmallVector Args(CI.args()); Args.pop_back(); Args.pop_back(); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI.getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args); unsigned NumArgs = CI.arg_size(); Rep = emitX86Select(Builder, CI.getArgOperand(NumArgs - 1), Rep, CI.getArgOperand(NumArgs - 2)); @@ -2320,8 +2326,8 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64. Value *Arg = CI->getArgOperand(0); Value *Ctlz = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, - {Arg->getType()}), + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz, + {Arg->getType()}), {Arg, Builder.getFalse()}, "ctlz"); Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc"); } else if (Name == "popc.ll") { @@ -2329,15 +2335,15 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, // i64. 
Value *Arg = CI->getArgOperand(0); Value *Popc = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop, - {Arg->getType()}), + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctpop, + {Arg->getType()}), Arg, "ctpop"); Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc"); } else if (Name == "h2f") { - Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::convert_from_fp16, - {Builder.getFloatTy()}), - CI->getArgOperand(0), "h2f"); + Rep = Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::convert_from_fp16, + {Builder.getFloatTy()}), + CI->getArgOperand(0), "h2f"); } else if (Name.consume_front("bitcast.") && (Name == "f2i" || Name == "i2f" || Name == "ll2d" || Name == "d2ll")) { @@ -2373,7 +2379,7 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, if (IID != Intrinsic::not_intrinsic && !F->getReturnType()->getScalarType()->isBFloatTy()) { rename(F); - Function *NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + Function *NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); SmallVector Args; for (size_t I = 0; I < NewFn->arg_size(); ++I) { Value *Arg = CI->getArgOperand(I); @@ -2480,15 +2486,15 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, } else if (Name == "sse.sqrt.ss" || Name == "sse2.sqrt.sd") { Value *Vec = CI->getArgOperand(0); Value *Elt0 = Builder.CreateExtractElement(Vec, (uint64_t)0); - Function *Intr = Intrinsic::getDeclaration(F->getParent(), Intrinsic::sqrt, - Elt0->getType()); + Function *Intr = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::sqrt, Elt0->getType()); Elt0 = Builder.CreateCall(Intr, Elt0); Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0); } else if (Name.starts_with("avx.sqrt.p") || Name.starts_with("sse2.sqrt.p") || Name.starts_with("sse.sqrt.p")) { Rep = - Builder.CreateCall(Intrinsic::getDeclaration( + 
Builder.CreateCall(Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::sqrt, CI->getType()), {CI->getArgOperand(0)}); } else if (Name.starts_with("avx512.mask.sqrt.p")) { @@ -2499,13 +2505,13 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, : Intrinsic::x86_avx512_sqrt_pd_512; Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(3)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); } else { - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), - Intrinsic::sqrt, - CI->getType()), - {CI->getArgOperand(0)}); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::sqrt, + CI->getType()), + {CI->getArgOperand(0)}); } Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); @@ -2629,8 +2635,9 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, break; } - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + {CI->getOperand(0), CI->getArgOperand(1)}); Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.mask.fpclass.p")) { Type *OpTy = CI->getArgOperand(0)->getType(); @@ -2652,8 +2659,9 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else llvm_unreachable("Unexpected intrinsic"); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + {CI->getOperand(0), CI->getArgOperand(1)}); Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.cmp.p")) { SmallVector 
Args(CI->args()); @@ -2681,8 +2689,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, std::swap(Mask, Args.back()); Args.push_back(Mask); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Args); } else if (Name.starts_with("avx512.mask.cmp.")) { // Integer compare intrinsics. unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); @@ -2776,8 +2784,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, cast(CI->getArgOperand(3))->getZExtValue() != 4)) { Intrinsic::ID IID = IsUnsigned ? Intrinsic::x86_avx512_uitofp_round : Intrinsic::x86_avx512_sitofp_round; - Function *F = - Intrinsic::getDeclaration(CI->getModule(), IID, {DstTy, SrcTy}); + Function *F = Intrinsic::getOrInsertDeclaration(CI->getModule(), IID, + {DstTy, SrcTy}); Rep = Builder.CreateCall(F, {Rep, CI->getArgOperand(3)}); } else { Rep = IsUnsigned ? 
Builder.CreateUIToFP(Rep, DstTy, "cvt") @@ -2819,7 +2827,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), ResultTy->getNumElements()); - Function *ELd = Intrinsic::getDeclaration( + Function *ELd = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::masked_expandload, ResultTy); Rep = Builder.CreateCall(ELd, {Ptr, MaskVec, CI->getOperand(1)}); } else if (Name.starts_with("avx512.mask.compress.store.")) { @@ -2834,7 +2842,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, getX86MaskVec(Builder, CI->getArgOperand(2), cast(ResultTy)->getNumElements()); - Function *CSt = Intrinsic::getDeclaration( + Function *CSt = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::masked_compressstore, ResultTy); Rep = Builder.CreateCall(CSt, {CI->getArgOperand(1), Ptr, MaskVec}); } else if (Name.starts_with("avx512.mask.compress.") || @@ -2847,7 +2855,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, bool IsCompress = Name[12] == 'c'; Intrinsic::ID IID = IsCompress ? 
Intrinsic::x86_avx512_mask_compress : Intrinsic::x86_avx512_mask_expand; - Function *Intr = Intrinsic::getDeclaration(F->getParent(), IID, ResultTy); + Function *Intr = + Intrinsic::getOrInsertDeclaration(F->getParent(), IID, ResultTy); Rep = Builder.CreateCall(Intr, {CI->getOperand(0), CI->getOperand(1), MaskVec}); } else if (Name.starts_with("xop.vpcom")) { @@ -2910,7 +2919,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, bool ZeroMask = Name[11] == 'z'; Rep = upgradeX86ConcatShift(Builder, *CI, true, ZeroMask); } else if (Name == "sse42.crc32.64.8") { - Function *CRC32 = Intrinsic::getDeclaration( + Function *CRC32 = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::x86_sse42_crc32_32_8); Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C)); @@ -3405,7 +3414,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_add_pd_512; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3421,7 +3430,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_div_pd_512; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3437,7 +3446,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_mul_pd_512; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), 
CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3453,7 +3462,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_sub_pd_512; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3471,13 +3480,13 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Intrinsic::ID IID = MinMaxTbl[IsMin][IsDouble]; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.mask.lzcnt.")) { Rep = - Builder.CreateCall(Intrinsic::getDeclaration( + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::ctlz, CI->getType()), {CI->getArgOperand(0), Builder.getInt1(false)}); Rep = @@ -3723,10 +3732,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (NegAcc) Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::fma, - Ops[0]->getType()), - Ops); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma, + Ops[0]->getType()), + Ops); if (IsScalar) Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); @@ -3738,10 +3747,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0); Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0); - Rep = 
Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::fma, - Ops[0]->getType()), - Ops); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma, + Ops[0]->getType()), + Ops); Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()), Rep, (uint64_t)0); @@ -3781,11 +3790,11 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_vfmadd_f64; else IID = Intrinsic::x86_avx512_vfmadd_f32; - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), IID); + Function *FMA = Intrinsic::getOrInsertDeclaration(CI->getModule(), IID); Rep = Builder.CreateCall(FMA, Ops); } else { - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, - A->getType()); + Function *FMA = Intrinsic::getOrInsertDeclaration( + CI->getModule(), Intrinsic::fma, A->getType()); Rep = Builder.CreateCall(FMA, {A, B, C}); } @@ -3837,11 +3846,12 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_vfmadd_pd_512; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - {A, B, C, CI->getArgOperand(4)}); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + {A, B, C, CI->getArgOperand(4)}); } else { - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, - A->getType()); + Function *FMA = Intrinsic::getOrInsertDeclaration( + CI->getModule(), Intrinsic::fma, A->getType()); Rep = Builder.CreateCall(FMA, {A, B, C}); } @@ -3868,8 +3878,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = - Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), Ops); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops); } else if 
(Name.starts_with("avx512.mask.vfmaddsub.p") || Name.starts_with("avx512.mask3.vfmaddsub.p") || Name.starts_with("avx512.maskz.vfmaddsub.p") || @@ -3892,16 +3902,16 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (IsSubAdd) Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - Ops); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops); } else { int NumElts = cast(CI->getType())->getNumElements(); Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, - Ops[0]->getType()); + Function *FMA = Intrinsic::getOrInsertDeclaration( + CI->getModule(), Intrinsic::fma, Ops[0]->getType()); Value *Odd = Builder.CreateCall(FMA, Ops); Ops[2] = Builder.CreateFNeg(Ops[2]); Value *Even = Builder.CreateCall(FMA, Ops); @@ -3944,8 +3954,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), CI->getArgOperand(3)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru); @@ -3972,8 +3982,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); Value *PassThru = ZeroMask ? 
ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4008,8 +4018,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4038,8 +4048,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4062,7 +4072,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; Value *NewCall = Builder.CreateCall( - Intrinsic::getDeclaration(CI->getModule(), IID), Args); + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); // Extract the second result and store it. 
Value *Data = Builder.CreateExtractValue(NewCall, 1); @@ -4108,7 +4118,7 @@ static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI, Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]); - Function *NewF = Intrinsic::getDeclaration(CI->getModule(), NewID); + Function *NewF = Intrinsic::getOrInsertDeclaration(CI->getModule(), NewID); return Builder.CreateCall(NewF, Args, CI->getName()); } @@ -4117,16 +4127,17 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (Name == "mve.vctp64.old") { // Replace the old v4i1 vctp64 with a v2i1 vctp and predicate-casts to the // correct type. - Value *VCTP = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::arm_mve_vctp64), - CI->getArgOperand(0), CI->getName()); + Value *VCTP = + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::arm_mve_vctp64), + CI->getArgOperand(0), CI->getName()); Value *C1 = Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::arm_mve_pred_v2i, {VectorType::get(Builder.getInt1Ty(), 2, false)}), VCTP); return Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::arm_mve_pred_i2v, {VectorType::get(Builder.getInt1Ty(), 4, false)}), C1); @@ -4188,19 +4199,19 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, Type *Ty = Op->getType(); if (Ty->getScalarSizeInBits() == 1) { Value *C1 = Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::arm_mve_pred_v2i, {VectorType::get(Builder.getInt1Ty(), 4, false)}), Op); Op = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), - Intrinsic::arm_mve_pred_i2v, {V2I1Ty}), + Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::arm_mve_pred_i2v, {V2I1Ty}), C1); } Ops.push_back(Op); } - Function *Fn 
= Intrinsic::getDeclaration(F->getParent(), ID, Tys); + Function *Fn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); return Builder.CreateCall(Fn, Ops, CI->getName()); } llvm_unreachable("Unknown function for ARM CallBase upgrade."); @@ -5088,7 +5099,8 @@ void llvm::UpgradeARCRuntime(Module &M) { if (!Fn) return; - Function *NewFn = llvm::Intrinsic::getDeclaration(&M, IntrinsicFunc); + Function *NewFn = + llvm::Intrinsic::getOrInsertDeclaration(&M, IntrinsicFunc); for (User *U : make_early_inc_range(Fn->users())) { CallInst *CI = dyn_cast(U); diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index ee084e870263d0..1cf998c6850068 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -2468,7 +2468,7 @@ LLVMValueRef LLVMGetIntrinsicDeclaration(LLVMModuleRef Mod, size_t ParamCount) { ArrayRef Tys(unwrap(ParamTypes), ParamCount); auto IID = llvm_map_to_intrinsic_id(ID); - return wrap(llvm::Intrinsic::getDeclaration(unwrap(Mod), IID, Tys)); + return wrap(llvm::Intrinsic::getOrInsertDeclaration(unwrap(Mod), IID, Tys)); } const char *LLVMIntrinsicGetName(unsigned ID, size_t *NameLength) { diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 0db82cdd6373c8..447a9d65174636 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -991,7 +991,7 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val, LLVMContext &Ctx = LinkedInstr->getContext(); Module *M = LinkedInstr->getModule(); if (!AssignFn) - AssignFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_assign); + AssignFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign); std::array Args = { MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)), @@ -1060,7 +1060,7 @@ static Value *getDbgIntrinsicValueImpl(LLVMContext &VMContext, Value *V) { } static Function *getDeclareIntrin(Module &M) { - return Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare); + return Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_declare); } 
DbgInstPtr DIBuilder::insertDbgValueIntrinsic( @@ -1074,7 +1074,7 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic( } if (!ValueFn) - ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value); + ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value); return insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertBB, InsertBefore); } @@ -1175,7 +1175,7 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, } if (!LabelFn) - LabelFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_label); + LabelFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_label); Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)}; diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index 0db908211b553c..b37dbd534092c3 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -413,13 +413,13 @@ DbgVariableRecord::createDebugIntrinsic(Module *M, // Work out what sort of intrinsic we're going to produce. 
switch (getType()) { case DbgVariableRecord::LocationType::Declare: - IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_declare); + IntrinsicFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_declare); break; case DbgVariableRecord::LocationType::Value: - IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_value); + IntrinsicFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_value); break; case DbgVariableRecord::LocationType::Assign: - IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_assign); + IntrinsicFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign); break; case DbgVariableRecord::LocationType::End: case DbgVariableRecord::LocationType::Any: @@ -459,7 +459,7 @@ DbgVariableRecord::createDebugIntrinsic(Module *M, DbgLabelInst * DbgLabelRecord::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { - auto *LabelFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_label); + auto *LabelFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_label); Value *Args[] = { MetadataAsValue::get(getDebugLoc()->getContext(), getLabel())}; DbgLabelInst *DbgLabel = cast( diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 8bf695e835c368..3654bf9a9e70b5 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -91,8 +91,8 @@ Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) { if (cast(Scaling)->isZero()) return Scaling; Module *M = GetInsertBlock()->getParent()->getParent(); - Function *TheFn = - Intrinsic::getDeclaration(M, Intrinsic::vscale, {Scaling->getType()}); + Function *TheFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::vscale, + {Scaling->getType()}); CallInst *CI = CreateCall(TheFn, {}, {}, Name); return cast(Scaling)->isOne() ? 
CI : CreateMul(CI, Scaling); } @@ -142,7 +142,8 @@ CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size, Value *Ops[] = {Ptr, Val, Size, getInt1(isVolatile)}; Type *Tys[] = { Ptr->getType(), Size->getType() }; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys); + Function *TheFn = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::memset, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -170,7 +171,8 @@ CallInst *IRBuilderBase::CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, Value *Ops[] = {Dst, Val, Size, getInt1(IsVolatile)}; Type *Tys[] = {Dst->getType(), Size->getType()}; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset_inline, Tys); + Function *TheFn = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::memset_inline, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -197,7 +199,7 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet( Value *Ops[] = {Ptr, Val, Size, getInt32(ElementSize)}; Type *Tys[] = {Ptr->getType(), Size->getType()}; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::memset_element_unordered_atomic, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -227,7 +229,7 @@ CallInst *IRBuilderBase::CreateMemTransferInst( Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)}; Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() }; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, IntrID, Tys); + Function *TheFn = Intrinsic::getOrInsertDeclaration(M, IntrID, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -265,7 +267,7 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy( Value *Ops[] = {Dst, Src, Size, getInt32(ElementSize)}; Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()}; Module *M = BB->getParent()->getParent(); - 
Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::memcpy_element_unordered_atomic, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -381,7 +383,7 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove( Value *Ops[] = {Dst, Src, Size, getInt32(ElementSize)}; Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()}; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::memmove_element_unordered_atomic, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -411,23 +413,23 @@ CallInst *IRBuilderBase::getReductionIntrinsic(Intrinsic::ID ID, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Src}; Type *Tys[] = { Src->getType() }; - auto Decl = Intrinsic::getDeclaration(M, ID, Tys); + auto Decl = Intrinsic::getOrInsertDeclaration(M, ID, Tys); return CreateCall(Decl, Ops); } CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; - auto Decl = Intrinsic::getDeclaration(M, Intrinsic::vector_reduce_fadd, - {Src->getType()}); + auto Decl = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reduce_fadd, {Src->getType()}); return CreateCall(Decl, Ops); } CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; - auto Decl = Intrinsic::getDeclaration(M, Intrinsic::vector_reduce_fmul, - {Src->getType()}); + auto Decl = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reduce_fmul, {Src->getType()}); return CreateCall(Decl, Ops); } @@ -489,8 +491,8 @@ CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { "lifetime.start requires the size to be an i64"); Value *Ops[] = { Size, Ptr }; Module *M = BB->getParent()->getParent(); - Function *TheFn = - 
Intrinsic::getDeclaration(M, Intrinsic::lifetime_start, {Ptr->getType()}); + Function *TheFn = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::lifetime_start, {Ptr->getType()}); return CreateCall(TheFn, Ops); } @@ -504,8 +506,8 @@ CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) { "lifetime.end requires the size to be an i64"); Value *Ops[] = { Size, Ptr }; Module *M = BB->getParent()->getParent(); - Function *TheFn = - Intrinsic::getDeclaration(M, Intrinsic::lifetime_end, {Ptr->getType()}); + Function *TheFn = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::lifetime_end, {Ptr->getType()}); return CreateCall(TheFn, Ops); } @@ -523,8 +525,8 @@ CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) { // Fill in the single overloaded type: memory object type. Type *ObjectPtr[1] = {Ptr->getType()}; Module *M = BB->getParent()->getParent(); - Function *TheFn = - Intrinsic::getDeclaration(M, Intrinsic::invariant_start, ObjectPtr); + Function *TheFn = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::invariant_start, ObjectPtr); return CreateCall(TheFn, Ops); } @@ -556,13 +558,13 @@ IRBuilderBase::CreateAssumption(Value *Cond, Value *Ops[] = { Cond }; Module *M = BB->getParent()->getParent(); - Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume); + Function *FnAssume = Intrinsic::getOrInsertDeclaration(M, Intrinsic::assume); return CreateCall(FnAssume, Ops, OpBundles); } Instruction *IRBuilderBase::CreateNoAliasScopeDeclaration(Value *Scope) { Module *M = BB->getModule(); - auto *FnIntrinsic = Intrinsic::getDeclaration( + auto *FnIntrinsic = Intrinsic::getOrInsertDeclaration( M, Intrinsic::experimental_noalias_scope_decl, {}); return CreateCall(FnIntrinsic, {Scope}); } @@ -615,7 +617,7 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef OverloadedTypes, const Twine &Name) { Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, Id, 
OverloadedTypes); + Function *TheFn = Intrinsic::getOrInsertDeclaration(M, Id, OverloadedTypes); return CreateCall(TheFn, Ops, {}, Name); } @@ -765,9 +767,9 @@ static CallInst *CreateGCStatepointCallCommon( const Twine &Name) { Module *M = Builder->GetInsertBlock()->getParent()->getParent(); // Fill in the one generic type'd argument (the function is also vararg) - Function *FnStatepoint = - Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint, - {ActualCallee.getCallee()->getType()}); + Function *FnStatepoint = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_gc_statepoint, + {ActualCallee.getCallee()->getType()}); std::vector Args = getStatepointArgs( *Builder, ID, NumPatchBytes, ActualCallee.getCallee(), Flags, CallArgs); @@ -820,9 +822,9 @@ static InvokeInst *CreateGCStatepointInvokeCommon( const Twine &Name) { Module *M = Builder->GetInsertBlock()->getParent()->getParent(); // Fill in the one generic type'd argument (the function is also vararg) - Function *FnStatepoint = - Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint, - {ActualInvokee.getCallee()->getType()}); + Function *FnStatepoint = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_gc_statepoint, + {ActualInvokee.getCallee()->getType()}); std::vector Args = getStatepointArgs(*Builder, ID, NumPatchBytes, ActualInvokee.getCallee(), @@ -875,7 +877,7 @@ CallInst *IRBuilderBase::CreateGCResult(Instruction *Statepoint, Intrinsic::ID ID = Intrinsic::experimental_gc_result; Module *M = BB->getParent()->getParent(); Type *Types[] = {ResultType}; - Function *FnGCResult = Intrinsic::getDeclaration(M, ID, Types); + Function *FnGCResult = Intrinsic::getOrInsertDeclaration(M, ID, Types); Value *Args[] = {Statepoint}; return CreateCall(FnGCResult, Args, {}, Name); @@ -886,8 +888,8 @@ CallInst *IRBuilderBase::CreateGCRelocate(Instruction *Statepoint, Type *ResultType, const Twine &Name) { Module *M = BB->getParent()->getParent(); Type *Types[] = 
{ResultType}; - Function *FnGCRelocate = - Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, Types); + Function *FnGCRelocate = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_gc_relocate, Types); Value *Args[] = {Statepoint, getInt32(BaseOffset), getInt32(DerivedOffset)}; return CreateCall(FnGCRelocate, Args, {}, Name); @@ -897,7 +899,7 @@ CallInst *IRBuilderBase::CreateGCGetPointerBase(Value *DerivedPtr, const Twine &Name) { Module *M = BB->getParent()->getParent(); Type *PtrTy = DerivedPtr->getType(); - Function *FnGCFindBase = Intrinsic::getDeclaration( + Function *FnGCFindBase = Intrinsic::getOrInsertDeclaration( M, Intrinsic::experimental_gc_get_pointer_base, {PtrTy, PtrTy}); return CreateCall(FnGCFindBase, {DerivedPtr}, {}, Name); } @@ -906,7 +908,7 @@ CallInst *IRBuilderBase::CreateGCGetPointerOffset(Value *DerivedPtr, const Twine &Name) { Module *M = BB->getParent()->getParent(); Type *PtrTy = DerivedPtr->getType(); - Function *FnGCGetOffset = Intrinsic::getDeclaration( + Function *FnGCGetOffset = Intrinsic::getOrInsertDeclaration( M, Intrinsic::experimental_gc_get_pointer_offset, {PtrTy}); return CreateCall(FnGCGetOffset, {DerivedPtr}, {}, Name); } @@ -915,7 +917,7 @@ CallInst *IRBuilderBase::CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, Instruction *FMFSource, const Twine &Name) { Module *M = BB->getModule(); - Function *Fn = Intrinsic::getDeclaration(M, ID, {V->getType()}); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, {V->getType()}); return createCallHelper(Fn, {V}, Name, FMFSource); } @@ -923,7 +925,7 @@ Value *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource, const Twine &Name) { Module *M = BB->getModule(); - Function *Fn = Intrinsic::getDeclaration(M, ID, { LHS->getType() }); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, {LHS->getType()}); if (Value *V = Folder.FoldBinaryIntrinsic(ID, LHS, RHS, Fn->getReturnType(), FMFSource)) return V; 
@@ -936,7 +938,7 @@ CallInst *IRBuilderBase::CreateIntrinsic(Intrinsic::ID ID, Instruction *FMFSource, const Twine &Name) { Module *M = BB->getModule(); - Function *Fn = Intrinsic::getDeclaration(M, ID, Types); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, Types); return createCallHelper(Fn, Args, Name, FMFSource); } @@ -963,7 +965,7 @@ CallInst *IRBuilderBase::CreateIntrinsic(Type *RetTy, Intrinsic::ID ID, "Wrong types for intrinsic!"); // TODO: Handle varargs intrinsics. - Function *Fn = Intrinsic::getDeclaration(M, ID, OverloadTys); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, OverloadTys); return createCallHelper(Fn, Args, Name, FMFSource); } @@ -1120,7 +1122,7 @@ Value *IRBuilderBase::CreateLaunderInvariantGroup(Value *Ptr) { "launder.invariant.group only applies to pointers."); auto *PtrType = Ptr->getType(); Module *M = BB->getParent()->getParent(); - Function *FnLaunderInvariantGroup = Intrinsic::getDeclaration( + Function *FnLaunderInvariantGroup = Intrinsic::getOrInsertDeclaration( M, Intrinsic::launder_invariant_group, {PtrType}); assert(FnLaunderInvariantGroup->getReturnType() == PtrType && @@ -1137,7 +1139,7 @@ Value *IRBuilderBase::CreateStripInvariantGroup(Value *Ptr) { auto *PtrType = Ptr->getType(); Module *M = BB->getParent()->getParent(); - Function *FnStripInvariantGroup = Intrinsic::getDeclaration( + Function *FnStripInvariantGroup = Intrinsic::getOrInsertDeclaration( M, Intrinsic::strip_invariant_group, {PtrType}); assert(FnStripInvariantGroup->getReturnType() == PtrType && @@ -1152,7 +1154,8 @@ Value *IRBuilderBase::CreateVectorReverse(Value *V, const Twine &Name) { auto *Ty = cast(V->getType()); if (isa(Ty)) { Module *M = BB->getParent()->getParent(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, Ty); + Function *F = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, Ty); return Insert(CallInst::Create(F, V), Name); } // Keep the original behaviour for fixed vector @@ -1171,7 
+1174,8 @@ Value *IRBuilderBase::CreateVectorSplice(Value *V1, Value *V2, int64_t Imm, if (auto *VTy = dyn_cast(V1->getType())) { Module *M = BB->getParent()->getParent(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::vector_splice, VTy); + Function *F = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_splice, VTy); Value *Ops[] = {V1, V2, getInt32(Imm)}; return Insert(CallInst::Create(F, Ops), Name); @@ -1225,7 +1229,7 @@ Value *IRBuilderBase::CreatePreserveArrayAccessIndex( Type *ResultType = GetElementPtrInst::getGEPReturnType(Base, IdxList); Module *M = BB->getParent()->getParent(); - Function *FnPreserveArrayAccessIndex = Intrinsic::getDeclaration( + Function *FnPreserveArrayAccessIndex = Intrinsic::getOrInsertDeclaration( M, Intrinsic::preserve_array_access_index, {ResultType, BaseType}); Value *DimV = getInt32(Dimension); @@ -1246,7 +1250,7 @@ Value *IRBuilderBase::CreatePreserveUnionAccessIndex( auto *BaseType = Base->getType(); Module *M = BB->getParent()->getParent(); - Function *FnPreserveUnionAccessIndex = Intrinsic::getDeclaration( + Function *FnPreserveUnionAccessIndex = Intrinsic::getOrInsertDeclaration( M, Intrinsic::preserve_union_access_index, {BaseType, BaseType}); Value *DIIndex = getInt32(FieldIndex); @@ -1271,7 +1275,7 @@ Value *IRBuilderBase::CreatePreserveStructAccessIndex( GetElementPtrInst::getGEPReturnType(Base, {Zero, GEPIndex}); Module *M = BB->getParent()->getParent(); - Function *FnPreserveStructAccessIndex = Intrinsic::getDeclaration( + Function *FnPreserveStructAccessIndex = Intrinsic::getOrInsertDeclaration( M, Intrinsic::preserve_struct_access_index, {ResultType, BaseType}); Value *DIIndex = getInt32(FieldIndex); @@ -1288,8 +1292,8 @@ Value *IRBuilderBase::CreatePreserveStructAccessIndex( Value *IRBuilderBase::createIsFPClass(Value *FPNum, unsigned Test) { ConstantInt *TestV = getInt32(Test); Module *M = BB->getParent()->getParent(); - Function *FnIsFPClass = - Intrinsic::getDeclaration(M, Intrinsic::is_fpclass, 
{FPNum->getType()}); + Function *FnIsFPClass = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::is_fpclass, {FPNum->getType()}); return CreateCall(FnIsFPClass, {FPNum, TestV}); } diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 0a6c93fde6302f..002bab8e079e50 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -629,9 +629,8 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const { return false; } -Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, - Type *ReturnType, - ArrayRef Params) { +Function *VPIntrinsic::getOrInsertDeclarationForParams( + Module *M, Intrinsic::ID VPID, Type *ReturnType, ArrayRef Params) { assert(isVPIntrinsic(VPID) && "not a VP intrinsic"); Function *VPFunc; switch (VPID) { @@ -641,7 +640,7 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, OverloadTy = Params[*VPReductionIntrinsic::getVectorParamPos(VPID)]->getType(); - VPFunc = Intrinsic::getDeclaration(M, VPID, OverloadTy); + VPFunc = Intrinsic::getOrInsertDeclaration(M, VPID, OverloadTy); break; } case Intrinsic::vp_trunc: @@ -658,43 +657,43 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, case Intrinsic::vp_lrint: case Intrinsic::vp_llrint: case Intrinsic::vp_cttz_elts: - VPFunc = - Intrinsic::getDeclaration(M, VPID, {ReturnType, Params[0]->getType()}); + VPFunc = Intrinsic::getOrInsertDeclaration( + M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::vp_is_fpclass: - VPFunc = Intrinsic::getDeclaration(M, VPID, {Params[0]->getType()}); + VPFunc = Intrinsic::getOrInsertDeclaration(M, VPID, {Params[0]->getType()}); break; case Intrinsic::vp_merge: case Intrinsic::vp_select: - VPFunc = Intrinsic::getDeclaration(M, VPID, {Params[1]->getType()}); + VPFunc = Intrinsic::getOrInsertDeclaration(M, VPID, {Params[1]->getType()}); break; case Intrinsic::vp_load: - VPFunc = Intrinsic::getDeclaration( + VPFunc = 
Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::experimental_vp_strided_load: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType(), Params[1]->getType()}); break; case Intrinsic::vp_gather: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::vp_store: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {Params[0]->getType(), Params[1]->getType()}); break; case Intrinsic::experimental_vp_strided_store: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {Params[0]->getType(), Params[1]->getType(), Params[2]->getType()}); break; case Intrinsic::vp_scatter: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {Params[0]->getType(), Params[1]->getType()}); break; case Intrinsic::experimental_vp_splat: - VPFunc = Intrinsic::getDeclaration(M, VPID, ReturnType); + VPFunc = Intrinsic::getOrInsertDeclaration(M, VPID, ReturnType); break; } assert(VPFunc && "Could not declare VP intrinsic"); diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index ef26b1926b9767..ff8b4b7a020c2f 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -713,7 +713,8 @@ Intrinsic::ID Intrinsic::lookupIntrinsicID(StringRef Name) { #include "llvm/IR/IntrinsicImpl.inc" #undef GET_INTRINSIC_ATTRIBUTES -Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef Tys) { +Function *Intrinsic::getOrInsertDeclaration(Module *M, ID id, + ArrayRef Tys) { // There can never be multiple globals with the same name of different types, // because intrinsics must be a specific type. 
auto *FT = getType(M->getContext(), id, Tys); @@ -1078,7 +1079,7 @@ std::optional Intrinsic::remangleIntrinsicFunction(Function *F) { // invalid and we'll get an error. ExistingGV->setName(WantedName + ".renamed"); } - return Intrinsic::getDeclaration(F->getParent(), ID, ArgTys); + return Intrinsic::getOrInsertDeclaration(F->getParent(), ID, ArgTys); }(); NewDecl->setCallingConv(F->getCallingConv()); diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 704bc8d339bc57..ab48d3e4101b72 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -89,21 +89,22 @@ Module::~Module() { void Module::removeDebugIntrinsicDeclarations() { auto *DeclareIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_declare); + Intrinsic::getOrInsertDeclaration(this, Intrinsic::dbg_declare); assert((!isMaterialized() || DeclareIntrinsicFn->hasZeroLiveUses()) && "Debug declare intrinsic should have had uses removed."); DeclareIntrinsicFn->eraseFromParent(); auto *ValueIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_value); + Intrinsic::getOrInsertDeclaration(this, Intrinsic::dbg_value); assert((!isMaterialized() || ValueIntrinsicFn->hasZeroLiveUses()) && "Debug value intrinsic should have had uses removed."); ValueIntrinsicFn->eraseFromParent(); auto *AssignIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_assign); + Intrinsic::getOrInsertDeclaration(this, Intrinsic::dbg_assign); assert((!isMaterialized() || AssignIntrinsicFn->hasZeroLiveUses()) && "Debug assign intrinsic should have had uses removed."); AssignIntrinsicFn->eraseFromParent(); - auto *LabelntrinsicFn = Intrinsic::getDeclaration(this, Intrinsic::dbg_label); + auto *LabelntrinsicFn = + Intrinsic::getOrInsertDeclaration(this, Intrinsic::dbg_label); assert((!isMaterialized() || LabelntrinsicFn->hasZeroLiveUses()) && "Debug label intrinsic should have had uses removed."); LabelntrinsicFn->eraseFromParent(); diff --git a/llvm/lib/IR/VectorBuilder.cpp 
b/llvm/lib/IR/VectorBuilder.cpp index f42948ba89042f..737f49b1334d76 100644 --- a/llvm/lib/IR/VectorBuilder.cpp +++ b/llvm/lib/IR/VectorBuilder.cpp @@ -108,8 +108,8 @@ Value *VectorBuilder::createVectorInstructionImpl(Intrinsic::ID VPID, if (VLenPosOpt) IntrinParams[*VLenPosOpt] = &requestEVL(); - auto *VPDecl = VPIntrinsic::getDeclarationForParams(&getModule(), VPID, - ReturnTy, IntrinParams); + auto *VPDecl = VPIntrinsic::getOrInsertDeclarationForParams( + &getModule(), VPID, ReturnTy, IntrinParams); return Builder.CreateCall(VPDecl, IntrinParams, Name); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8a217cd1ec5cf9..ae96e277b5fc69 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16454,8 +16454,8 @@ static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy)); if (Parts.size() == 4) { - auto *F = Intrinsic::getDeclaration(TI->getModule(), - Intrinsic::aarch64_neon_tbl4, VecTy); + auto *F = Intrinsic::getOrInsertDeclaration( + TI->getModule(), Intrinsic::aarch64_neon_tbl4, VecTy); Parts.push_back(ConstantVector::get(MaskConst)); Results.push_back(Builder.CreateCall(F, Parts)); Parts.clear(); @@ -16484,7 +16484,7 @@ static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { break; } - auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy); + auto *F = Intrinsic::getOrInsertDeclaration(TI->getModule(), TblID, VecTy); Parts.push_back(ConstantVector::get(MaskConst)); Results.push_back(Builder.CreateCall(F, Parts)); } @@ -16765,9 +16765,10 @@ static Function *getStructuredLoadFunction(Module *M, unsigned Factor, Intrinsic::aarch64_neon_ld3, Intrinsic::aarch64_neon_ld4}; if (Scalable) - return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy}); + return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy}); - 
return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy}); + return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2], + {LDVTy, PtrTy}); } static Function *getStructuredStoreFunction(Module *M, unsigned Factor, @@ -16781,9 +16782,10 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, Intrinsic::aarch64_neon_st3, Intrinsic::aarch64_neon_st4}; if (Scalable) - return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy}); + return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy}); - return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy}); + return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2], + {STVTy, PtrTy}); } /// Lower an interleaved load into a ldN intrinsic. @@ -27247,7 +27249,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, if (ValueTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; - Function *Ldxr = Intrinsic::getDeclaration(M, Int); + Function *Ldxr = Intrinsic::getOrInsertDeclaration(M, Int); Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); @@ -27266,7 +27268,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? 
Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; - Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); + Function *Ldxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys); const DataLayout &DL = M->getDataLayout(); IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); @@ -27281,7 +27283,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilderBase &Builder) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); + Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_clrex)); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, @@ -27296,7 +27299,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, if (Val->getType()->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; - Function *Stxr = Intrinsic::getDeclaration(M, Int); + Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Type *Int128Ty = Type::getInt128Ty(M->getContext()); @@ -27311,7 +27314,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, Intrinsic::ID Int = IsRelease ? 
Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; Type *Tys[] = { Addr->getType() }; - Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); + Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys); const DataLayout &DL = M->getDataLayout(); IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); @@ -27348,7 +27351,7 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), Offset), diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index e62437c28b863f..fe96fedcfb82dc 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -436,10 +436,10 @@ Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst, void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore, Value *Ptr, uint64_t Size) { - auto SetTagZeroFunc = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag_zero); - auto StgpFunc = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_stgp); + auto SetTagZeroFunc = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::aarch64_settag_zero); + auto StgpFunc = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::aarch64_stgp); InitializerBuilder IB(Size, DL, Ptr, SetTagFunc, SetTagZeroFunc, StgpFunc); bool LittleEndian = @@ -481,8 +481,8 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer( assert(PrologueBB); IRBuilder<> IRB(&PrologueBB->front()); - Function *IRG_SP = - Intrinsic::getDeclaration(F->getParent(), 
Intrinsic::aarch64_irg_sp); + Function *IRG_SP = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::aarch64_irg_sp); Instruction *Base = IRB.CreateCall(IRG_SP, {Constant::getNullValue(IRB.getInt64Ty())}); Base->setName("basetag"); @@ -563,8 +563,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { LI = DeleteLI.get(); } - SetTagFunc = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag); + SetTagFunc = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::aarch64_settag); Instruction *Base = insertBaseTaggedPointer(*Fn.getParent(), SInfo.AllocasToInstrument, DT); @@ -580,7 +580,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { NextTag = (NextTag + 1) % 16; // Replace alloca with tagp(alloca). IRBuilder<> IRB(Info.AI->getNextNode()); - Function *TagP = Intrinsic::getDeclaration( + Function *TagP = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::aarch64_tagp, {Info.AI->getType()}); Instruction *TagPCall = IRB.CreateCall(TagP, {Constant::getNullValue(Info.AI->getType()), Base, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 7b74bb2a03a642..91ab3fcfc4c70e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1637,7 +1637,7 @@ static std::optional instCombineSVEAllActive(IntrinsicInst &II, return std::nullopt; auto *Mod = II.getModule(); - auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()}); + auto *NewDecl = Intrinsic::getOrInsertDeclaration(Mod, IID, {II.getType()}); II.setCalledFunction(NewDecl); return &II; diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 174d95333d918d..2ee16a873e33b8 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -71,7 +71,7 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder) { // A save to 
TPIDR2 should be followed by clearing TPIDR2_EL0. Function *WriteIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_set_tpidr2); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_set_tpidr2); Builder.CreateCall(WriteIntr->getFunctionType(), WriteIntr, Builder.getInt64(0)); } @@ -114,7 +114,7 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, // Read TPIDR2_EL0 in PreludeBB & branch to SaveBB if not 0. Builder.SetInsertPoint(PreludeBB); Function *TPIDR2Intr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2); auto *TPIDR2 = Builder.CreateCall(TPIDR2Intr->getFunctionType(), TPIDR2Intr, {}, "tpidr2"); auto *Cmp = Builder.CreateCmp(ICmpInst::ICMP_NE, TPIDR2, @@ -128,20 +128,20 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, // Enable pstate.za at the start of the function. Builder.SetInsertPoint(&OrigBB->front()); Function *EnableZAIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_za_enable); Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); } if (FnAttrs.isNewZA()) { Function *ZeroIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_zero); Builder.CreateCall(ZeroIntr->getFunctionType(), ZeroIntr, Builder.getInt32(0xff)); } if (FnAttrs.isNewZT0()) { Function *ClearZT0Intr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero_zt); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_zero_zt); Builder.CreateCall(ClearZT0Intr->getFunctionType(), ClearZT0Intr, {Builder.getInt32(0)}); } @@ -153,8 +153,8 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, if (!T || !isa(T)) continue; Builder.SetInsertPoint(T); - Function *DisableZAIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_disable); + Function *DisableZAIntr = 
Intrinsic::getOrInsertDeclaration( + M, Intrinsic::aarch64_sme_za_disable); Builder.CreateCall(DisableZAIntr->getFunctionType(), DisableZAIntr); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index f408a013d7a379..ea88ed424dc597 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -407,8 +407,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, Value *const Identity) const { Type *AtomicTy = V->getType(); Module *M = B.GetInsertBlock()->getModule(); - Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); + Function *UpdateDPP = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_update_dpp, AtomicTy); // Reduce within each row of 16 lanes. for (unsigned Idx = 0; Idx < 4; Idx++) { @@ -439,8 +439,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and // combine them with a scalar operation. 
- Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy); + Function *ReadLane = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_readlane, AtomicTy); Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); return buildNonAtomicBinOp(B, Op, Lane0, Lane32); @@ -453,8 +453,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, Value *Identity) const { Type *AtomicTy = V->getType(); Module *M = B.GetInsertBlock()->getModule(); - Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); + Function *UpdateDPP = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_update_dpp, AtomicTy); for (unsigned Idx = 0; Idx < 4; Idx++) { V = buildNonAtomicBinOp( @@ -513,18 +513,18 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, Value *Identity) const { Type *AtomicTy = V->getType(); Module *M = B.GetInsertBlock()->getModule(); - Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); + Function *UpdateDPP = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_update_dpp, AtomicTy); if (ST->hasDPPWavefrontShifts()) { // GFX9 has DPP wavefront shift operations. V = B.CreateCall(UpdateDPP, {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); } else { - Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy); - Function *WriteLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy); + Function *ReadLane = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_readlane, AtomicTy); + Function *WriteLane = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_writelane, AtomicTy); // On GFX10 all DPP operations are confined to a single row. To get cross- // row operations we have to use permlane or readlane. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 052e1140533f3f..7d3164c79089e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -119,8 +119,8 @@ class AMDGPUCodeGenPrepareImpl return SqrtF32; LLVMContext &Ctx = Mod->getContext(); - SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt, - {Type::getFloatTy(Ctx)}); + SqrtF32 = Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_sqrt, + {Type::getFloatTy(Ctx)}); return SqrtF32; } @@ -129,7 +129,7 @@ class AMDGPUCodeGenPrepareImpl return LdexpF32; LLVMContext &Ctx = Mod->getContext(); - LdexpF32 = Intrinsic::getDeclaration( + LdexpF32 = Intrinsic::getOrInsertDeclaration( Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)}); return LdexpF32; } @@ -577,7 +577,7 @@ bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32( Type *I32Ty = getI32Ty(Builder, I.getType()); Function *I32 = - Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty }); + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::bitreverse, {I32Ty}); Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty); Value *ExtRes = Builder.CreateCall(I32, { ExtOp }); Value *LShrOp = @@ -1260,8 +1260,8 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl( Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty) : Builder.CreateUIToFP(IB,F32Ty); - Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, - Builder.getFloatTy()); + Function *RcpDecl = Intrinsic::getOrInsertDeclaration( + Mod, Intrinsic::amdgcn_rcp, Builder.getFloatTy()); Value *RCP = Builder.CreateCall(RcpDecl, { FB }); Value *FQM = Builder.CreateFMul(FA, RCP); @@ -1455,7 +1455,8 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder, // Initial estimate of inv(y). 
Value *FloatY = Builder.CreateUIToFP(Y, F32Ty); - Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty); + Function *Rcp = + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty); Value *RcpY = Builder.CreateCall(Rcp, {FloatY}); Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast(0x4F7FFFFE)); Value *ScaledY = Builder.CreateFMul(RcpY, Scale); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp index 45207c06a788a2..e48fed025857fa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp @@ -237,7 +237,7 @@ bool optimizeSection(ArrayRef> MergeableInsts) { else NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa; - Function *NewIntrin = Intrinsic::getDeclaration( + Function *NewIntrin = Intrinsic::getOrInsertDeclaration( IIList.front()->getModule(), NewIntrinID, OverloadTys); Args[ImageDimIntr->DMaskIndex] = ConstantInt::get(DMask->getType(), NewMaskVal); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index ecb4d4fa5d5c39..6a5a48778197e4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -130,7 +130,8 @@ static std::optional modifyIntrinsicCall( // Modify arguments and types Func(Args, ArgTys); - Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys); + Function *I = + Intrinsic::getOrInsertDeclaration(OldIntr.getModule(), NewIntr, ArgTys); CallInst *NewCall = IC.Builder.CreateCall(I, Args); NewCall->takeName(&OldIntr); @@ -502,7 +503,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp)) break; - Function *NewDecl = Intrinsic::getDeclaration( + Function *NewDecl = Intrinsic::getOrInsertDeclaration( 
SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()}); InnerFMF |= FMF; @@ -527,7 +528,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // f16 amdgcn.sqrt is identical to regular sqrt. if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) { - Function *NewDecl = Intrinsic::getDeclaration( + Function *NewDecl = Intrinsic::getOrInsertDeclaration( II.getModule(), Intrinsic::sqrt, {II.getType()}); II.setCalledFunction(NewDecl); return &II; @@ -614,7 +615,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Value *Src1 = II.getArgOperand(1); const ConstantInt *CMask = dyn_cast(Src1); if (CMask) { - II.setCalledOperand(Intrinsic::getDeclaration( + II.setCalledOperand(Intrinsic::getOrInsertDeclaration( II.getModule(), Intrinsic::is_fpclass, Src0->getType())); // Clamp any excess bits, as they're illegal for the generic intrinsic. @@ -890,7 +891,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // register (which contains the bitmask of live threads). So a // comparison that always returns true is the same as a read of the // EXEC register. 
- Function *NewF = Intrinsic::getDeclaration( + Function *NewF = Intrinsic::getOrInsertDeclaration( II.getModule(), Intrinsic::read_register, II.getType()); Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")}; MDNode *MD = MDNode::get(II.getContext(), MDArgs); @@ -989,7 +990,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) break; - Function *NewF = Intrinsic::getDeclaration( + Function *NewF = Intrinsic::getOrInsertDeclaration( II.getModule(), NewIID, {II.getType(), SrcLHS->getType()}); Value *Args[] = {SrcLHS, SrcRHS, ConstantInt::get(CC->getType(), SrcPred)}; @@ -1205,7 +1206,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // If we can prove we don't have one of the special cases then we can use a // normal fma instead. if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) { - II.setCalledOperand(Intrinsic::getDeclaration( + II.setCalledOperand(Intrinsic::getOrInsertDeclaration( II.getModule(), Intrinsic::fma, II.getType())); return &II; } @@ -1401,7 +1402,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask); } - Function *NewIntrin = Intrinsic::getDeclaration( + Function *NewIntrin = Intrinsic::getOrInsertDeclaration( II.getModule(), II.getIntrinsicID(), OverloadTys); CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args); NewCall->takeName(&II); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 53628981e12409..800bdbe04cf70d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1555,8 +1555,8 @@ bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const { MIB.addImm(MFI->getLDSSize()); } else { Module *M = MF->getFunction().getParent(); - const GlobalValue *GV - 
= Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize); + const GlobalValue *GV = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize); MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index e01c9dc66a3f1f..eb553ae4eb80ff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -753,7 +753,7 @@ bool AMDGPULibCalls::fold(CallInst *CI) { CI->setArgOperand(1, SplatArg1); } - CI->setCalledFunction(Intrinsic::getDeclaration( + CI->setCalledFunction(Intrinsic::getOrInsertDeclaration( CI->getModule(), Intrinsic::ldexp, {CI->getType(), CI->getArgOperand(1)->getType()})); return true; @@ -1034,7 +1034,8 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31)) FunctionCallee ExpExpr; if (ShouldUseIntrinsic) - ExpExpr = Intrinsic::getDeclaration(M, Intrinsic::exp2, {FPOp->getType()}); + ExpExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::exp2, + {FPOp->getType()}); else { ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo)); if (!ExpExpr) @@ -1108,8 +1109,8 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, if (needlog) { FunctionCallee LogExpr; if (ShouldUseIntrinsic) { - LogExpr = - Intrinsic::getDeclaration(M, Intrinsic::log2, {FPOp->getType()}); + LogExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::log2, + {FPOp->getType()}); } else { LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo)); if (!LogExpr) @@ -1298,8 +1299,8 @@ void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, } } - CI->setCalledFunction( - Intrinsic::getDeclaration(CI->getModule(), IntrID, {CI->getType()})); + CI->setCalledFunction(Intrinsic::getOrInsertDeclaration( + CI->getModule(), IntrID, {CI->getType()})); } bool 
AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic( diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 51a5b7702c0093..ff5eb81490106f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -285,8 +285,8 @@ class AMDGPULowerModuleLDS { BasicBlock *Entry = &Func->getEntryBlock(); IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); - Function *Decl = - Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {}); + Function *Decl = Intrinsic::getOrInsertDeclaration( + Func->getParent(), Intrinsic::donothing, {}); Value *UseInstance[1] = { Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)}; @@ -529,8 +529,8 @@ class AMDGPULowerModuleLDS { // block to spare deduplicating it later. auto [It, Inserted] = tableKernelIndexCache.try_emplace(F); if (Inserted) { - Function *Decl = - Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {}); + Function *Decl = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::amdgcn_lds_kernel_id, {}); auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); IRBuilder<> Builder(&*InsertAt); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 24bfbff41ec5c0..63da3443479be3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -973,10 +973,10 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); if (!IsAMDHSA) { - Function *LocalSizeYFn = - Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y); - Function *LocalSizeZFn = - Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z); + Function *LocalSizeYFn = Intrinsic::getOrInsertDeclaration( + Mod, Intrinsic::r600_read_local_size_y); + Function *LocalSizeZFn = Intrinsic::getOrInsertDeclaration( 
+ Mod, Intrinsic::r600_read_local_size_z); CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {}); CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); @@ -1022,7 +1022,7 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { // } hsa_kernel_dispatch_packet_t // Function *DispatchPtrFn = - Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {}); DispatchPtr->addRetAttr(Attribute::NoAlias); @@ -1082,7 +1082,7 @@ Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder, llvm_unreachable("invalid dimension"); } - Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID); + Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID); CallInst *CI = Builder.CreateCall(WorkitemIdFn); ST.makeLIDRangeMetadata(CI); F->removeFnAttr(AttrName); @@ -1564,7 +1564,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, continue; case Intrinsic::objectsize: { Value *Src = Intr->getOperand(0); - Function *ObjectSize = Intrinsic::getDeclaration( + Function *ObjectSize = Intrinsic::getOrInsertDeclaration( Mod, Intrinsic::objectsize, {Intr->getType(), PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 4669bb45473cb0..cfce56f0bfe968 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -336,8 +336,8 @@ static void markUsedByKernel(Function *Func, GlobalVariable *SGV) { BasicBlock *Entry = &Func->getEntryBlock(); IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); - Function *Decl = - Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {}); + Function *Decl = Intrinsic::getOrInsertDeclaration(Func->getParent(), + Intrinsic::donothing, {}); Value *UseInstance[1] = { 
Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)}; @@ -922,7 +922,8 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, StringRef("__asan_free_impl"), FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false)); Value *ReturnAddr = IRB.CreateCall( - Intrinsic::getDeclaration(&M, Intrinsic::returnaddress), IRB.getInt32(0)); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress), + IRB.getInt32(0)); Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty); Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty); IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt}); @@ -1055,8 +1056,8 @@ void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses( SetVector LDSInstructions; getLDSMemoryInstructions(Func, LDSInstructions); - Function *Decl = - Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {}); + Function *Decl = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::amdgcn_lds_kernel_id, {}); auto *KernelId = IRB.CreateCall(Decl, {}); GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable; GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index d701bf037fdfa6..5d7ca89571b27b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1112,8 +1112,8 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS)) return nullptr; Module *M = II->getModule(); - Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(), - {DestTy, SrcTy, DestTy}); + Function *NewDecl = Intrinsic::getOrInsertDeclaration( + M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return II; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 3758c768b8673f..59cc61e347bc0a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -295,8 +295,8 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, // Remove and delete the unreachable inst. UnreachableBlock->getTerminator()->eraseFromParent(); - Function *UnreachableIntrin = - Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable); + Function *UnreachableIntrin = Intrinsic::getOrInsertDeclaration( + F.getParent(), Intrinsic::amdgcn_unreachable); // Insert a call to an intrinsic tracking that this is an unreachable // point, in case we want to kill the active lanes or something later. diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index edd881c84078c6..a7f2b66e3cd116 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -117,13 +117,15 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) { BoolUndef = PoisonValue::get(Boolean); IntMaskZero = ConstantInt::get(IntMask, 0); - If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if, { IntMask }); - Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else, - { IntMask, IntMask }); - IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break, - { IntMask }); - Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask }); - EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask }); + If = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_if, {IntMask}); + Else = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_else, + {IntMask, IntMask}); + IfBreak = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_if_break, + {IntMask}); + Loop = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_loop, {IntMask}); + EndCf = 
Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_end_cf, + {IntMask}); } /// Is the branch condition uniform or did the StructurizeCFG pass diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index bf757edfa85890..a35582bebb08a3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21149,7 +21149,7 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get // here. if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { - Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); + Function *MCR = Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_mcr); Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(7), Builder.getInt32(10), Builder.getInt32(5)}; @@ -21160,7 +21160,7 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, llvm_unreachable("makeDMB on a target so old that it has no barriers"); } } else { - Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); + Function *DMB = Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_dmb); // Only a full system barrier exists in the M-class architectures. Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; Constant *CDomain = Builder.getInt32(Domain); @@ -21417,7 +21417,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, if (ValueTy->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; - Function *Ldrex = Intrinsic::getDeclaration(M, Int); + Function *Ldrex = Intrinsic::getOrInsertDeclaration(M, Int); Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); @@ -21433,7 +21433,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? 
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; - Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); + Function *Ldrex = Intrinsic::getOrInsertDeclaration(M, Int, Tys); CallInst *CI = Builder.CreateCall(Ldrex, Addr); CI->addParamAttr( @@ -21446,7 +21446,8 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( if (!Subtarget->hasV7Ops()) return; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); + Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_clrex)); } Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, @@ -21461,7 +21462,7 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, if (Val->getType()->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; - Function *Strex = Intrinsic::getDeclaration(M, Int); + Function *Strex = Intrinsic::getOrInsertDeclaration(M, Int); Type *Int32Ty = Type::getInt32Ty(M->getContext()); Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); @@ -21473,7 +21474,7 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, Intrinsic::ID Int = IsRelease ? 
Intrinsic::arm_stlex : Intrinsic::arm_strex; Type *Tys[] = { Addr->getType() }; - Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); + Function *Strex = Intrinsic::getOrInsertDeclaration(M, Int, Tys); CallInst *CI = Builder.CreateCall( Strex, {Builder.CreateZExtOrBitCast( @@ -21601,8 +21602,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; - Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); + Function *VldnFunc = Intrinsic::getOrInsertDeclaration( + LI->getModule(), LoadInts[Factor - 2], Tys); SmallVector Ops; Ops.push_back(BaseAddr); @@ -21617,7 +21618,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace()); Type *Tys[] = {VecTy, PtrTy}; Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); + Intrinsic::getOrInsertDeclaration(LI->getModule(), LoadInts, Tys); SmallVector Ops; Ops.push_back(BaseAddr); @@ -21762,7 +21763,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace()); Type *Tys[] = {PtrTy, SubVecTy}; - Function *VstNFunc = Intrinsic::getDeclaration( + Function *VstNFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), StoreInts[Factor - 2], Tys); SmallVector Ops; @@ -21778,7 +21779,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace()); Type *Tys[] = {PtrTy, SubVecTy}; Function *VstNFunc = - Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys); + Intrinsic::getOrInsertDeclaration(SI->getModule(), StoreInts, Tys); SmallVector Ops; Ops.push_back(BaseAddr); diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp index 861d60d3bcce95..7804725ce77319 100644 --- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ 
b/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -630,13 +630,14 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) { Value* Args[] = { WideLd0, WideLd1, Acc }; Function *SMLAD = nullptr; if (Exchange) - SMLAD = Acc->getType()->isIntegerTy(32) ? - Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) : - Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx); + SMLAD = + Acc->getType()->isIntegerTy(32) + ? Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smladx) + : Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlaldx); else - SMLAD = Acc->getType()->isIntegerTy(32) ? - Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) : - Intrinsic::getDeclaration(M, Intrinsic::arm_smlald); + SMLAD = Acc->getType()->isIntegerTy(32) + ? Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlad) + : Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlald); IRBuilder Builder(InsertAfter->getParent(), BasicBlock::iterator(InsertAfter)); diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index e554e4d428d46f..60211db8a61ae3 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -401,7 +401,7 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; } - Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); + Function *VCTP = Intrinsic::getOrInsertDeclaration(M, VCTPID); Value *VCTPCall = Builder.CreateCall(VCTP, Processed); ActiveLaneMask->replaceAllUsesWith(VCTPCall); diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index 4be6220b358ba3..7921518166f97d 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -103,7 +103,7 @@ uint32_t BPFCoreSharedInfo::SeqNum; Instruction *BPFCoreSharedInfo::insertPassThrough(Module *M, BasicBlock 
*BB, Instruction *Input, Instruction *Before) { - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::bpf_passthrough, {Input->getType(), Input->getType()}); Constant *SeqNumVal = ConstantInt::get(Type::getInt32Ty(BB->getContext()), BPFCoreSharedInfo::SeqNum++); diff --git a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp index 4ab0cbcc924779..4ca7bbe9c2a8c4 100644 --- a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp +++ b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp @@ -126,7 +126,7 @@ bool BPFAdjustOptImpl::adjustICmpToBuiltin() { Constant *Opcode = ConstantInt::get(Type::getInt32Ty(BB.getContext()), Op); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::bpf_compare, {Op0->getType(), ConstOp1->getType()}); auto *NewInst = CallInst::Create(Fn, {Opcode, Op0, ConstOp1}); NewInst->insertBefore(&I); diff --git a/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp b/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp index 5d8339b4a44cec..9f7e3414beb8e3 100644 --- a/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp +++ b/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp @@ -163,7 +163,7 @@ static CallInst *makeIntrinsicCall(Module *M, ArrayRef Types, ArrayRef Args) { - Function *Fn = Intrinsic::getDeclaration(M, Intrinsic, Types); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, Intrinsic, Types); return CallInst::Create(Fn, Args); } diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index c0f8d433833ee7..99df4850872078 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -134,8 +134,8 @@ class OpLowerer { /// piecemeal way - we can add the casts in to avoid updating all of the uses /// or defs, and by the end all of the casts will be redundant. 
Value *createTmpHandleCast(Value *V, Type *Ty) { - Function *CastFn = Intrinsic::getDeclaration(&M, Intrinsic::dx_cast_handle, - {Ty, V->getType()}); + Function *CastFn = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::dx_cast_handle, {Ty, V->getType()}); CallInst *Cast = OpBuilder.getIRB().CreateCall(CastFn, {V}); CleanupCasts.push_back(Cast); return Cast; diff --git a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp index 3274f9162b543a..65bbb1364488f7 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -212,7 +212,7 @@ bool HexagonGenExtract::convert(Instruction *In) { Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu : Intrinsic::hexagon_S2_extractup; Module *Mod = BB->getParent()->getParent(); - Function *ExtF = Intrinsic::getDeclaration(Mod, IntId); + Function *ExtF = Intrinsic::getOrInsertDeclaration(Mod, IntId); Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)}); if (SL != 0) NewIn = IRB.CreateShl(NewIn, SL, CSL->getName()); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 856c952e785dac..03c12f5ce44707 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -3865,7 +3865,7 @@ Value *HexagonTargetLowering::emitLoadLinked(IRBuilderBase &Builder, assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic loads supported"); Intrinsic::ID IntID = (SZ == 32) ? 
Intrinsic::hexagon_L2_loadw_locked : Intrinsic::hexagon_L4_loadd_locked; - Function *Fn = Intrinsic::getDeclaration(M, IntID); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, IntID); Value *Call = Builder.CreateCall(Fn, Addr, "larx"); @@ -3886,7 +3886,7 @@ Value *HexagonTargetLowering::emitStoreConditional(IRBuilderBase &Builder, assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic stores supported"); Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_S2_storew_locked : Intrinsic::hexagon_S4_stored_locked; - Function *Fn = Intrinsic::getDeclaration(M, IntID); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, IntID); Val = Builder.CreateBitCast(Val, CastTy); diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 4ef009c87a1e63..705e1f43851f7a 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1532,7 +1532,8 @@ Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At, ParsedValues &PV) { IRBuilder<> B(&*At); Module *M = At->getParent()->getParent()->getParent(); - Function *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw); + Function *PMF = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::hexagon_M4_pmpyw); Value *P = PV.P, *Q = PV.Q, *P0 = P; unsigned IC = PV.IterCount; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index f4e495266eae3f..d2cfd3851e711d 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -2390,8 +2390,8 @@ auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo, Type *Int64Ty = Type::getInt64Ty(F.getContext()); Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst"); Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst"); - Function *FI = Intrinsic::getDeclaration(F.getParent(), - 
Intrinsic::hexagon_S2_valignrb); + Function *FI = Intrinsic::getOrInsertDeclaration( + F.getParent(), Intrinsic::hexagon_S2_valignrb); Value *Call = Builder.CreateCall(FI, {Hi64, Lo64, Amt}, "cup"); return Builder.CreateBitCast(Call, Lo->getType(), "cst"); } @@ -2587,12 +2587,13 @@ auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder, unsigned HwLen = HST.getVectorLength(); Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast : Intrinsic::hexagon_V6_pred_typecast_128B; - Function *FI = - Intrinsic::getDeclaration(F.getParent(), TC, {DestTy, Val->getType()}); + Function *FI = Intrinsic::getOrInsertDeclaration(F.getParent(), TC, + {DestTy, Val->getType()}); return Builder.CreateCall(FI, {Val}, "cup"); }; - Function *IntrFn = Intrinsic::getDeclaration(F.getParent(), IntID, ArgTys); + Function *IntrFn = + Intrinsic::getOrInsertDeclaration(F.getParent(), IntID, ArgTys); FunctionType *IntrTy = IntrFn->getFunctionType(); SmallVector IntrArgs; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index bfafb331752108..8edca34624e9b2 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -5808,7 +5808,7 @@ Value *LoongArchTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty()); Type *Tys[] = {AlignedAddr->getType()}; Function *MaskedCmpXchg = - Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys); + Intrinsic::getOrInsertDeclaration(CI->getModule(), CmpXchgIntrID, Tys); Value *Result = Builder.CreateCall( MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, FailureOrdering}); Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); @@ -5838,7 +5838,7 @@ Value *LoongArchTargetLowering::emitMaskedAtomicRMWIntrinsic( Value *Ordering = Builder.getIntN(GRLen, static_cast(AI->getOrdering())); Type *Tys[] = {AlignedAddr->getType()}; - Function *LlwOpScwLoop 
= Intrinsic::getDeclaration( + Function *LlwOpScwLoop = Intrinsic::getOrInsertDeclaration( AI->getModule(), getIntrinsicForMaskedAtomicRMWBinOp(GRLen, AI->getOperation()), Tys); diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 082546c4dd72f8..1e30e0113e43c7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -282,7 +282,7 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam, [](Value *Addr, Instruction *OriginalUser) -> Value * { PointerType *ReturnTy = PointerType::get(OriginalUser->getContext(), ADDRESS_SPACE_GENERIC); - Function *CvtToGen = Intrinsic::getDeclaration( + Function *CvtToGen = Intrinsic::getOrInsertDeclaration( OriginalUser->getModule(), Intrinsic::nvvm_ptr_param_to_gen, {ReturnTy, PointerType::get(OriginalUser->getContext(), ADDRESS_SPACE_PARAM)}); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 9a8ea8f87896ad..b141229dcfc733 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -360,7 +360,8 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { // type argument, equal to that of the nvvm intrinsic's argument. Type *Tys[] = {II->getArgOperand(0)->getType()}; return CallInst::Create( - Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); + Intrinsic::getOrInsertDeclaration(II->getModule(), *Action.IID, Tys), + Args); } // Simplify to target-generic binary op. 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index d9847a21489e63..911d92f0c4846b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12181,7 +12181,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *Func = Intrinsic::getDeclaration(M, Id); + Function *Func = Intrinsic::getOrInsertDeclaration(M, Id); return Builder.CreateCall(Func, {}); } @@ -12206,7 +12206,7 @@ Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder, // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. if (isa(Inst)) return Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( Builder.GetInsertBlock()->getParent()->getParent(), Intrinsic::ppc_cfence, {Inst->getType()}), {Inst}); @@ -19005,7 +19005,7 @@ Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic( Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = Incr->getType(); assert(ValTy->getPrimitiveSizeInBits() == 128); - Function *RMW = Intrinsic::getDeclaration( + Function *RMW = Intrinsic::getOrInsertDeclaration( M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation())); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Value *IncrLo = Builder.CreateTrunc(Incr, Int64Ty, "incr_lo"); @@ -19028,7 +19028,7 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( Type *ValTy = CmpVal->getType(); assert(ValTy->getPrimitiveSizeInBits() == 128); Function *IntCmpXchg = - Intrinsic::getDeclaration(M, Intrinsic::ppc_cmpxchg_i128); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::ppc_cmpxchg_i128); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Value *CmpLo = Builder.CreateTrunc(CmpVal, Int64Ty, "cmp_lo"); Value *CmpHi = diff --git a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp 
b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp index d10fe11bb5877b..9c2b58a47392f9 100644 --- a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp +++ b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp @@ -123,7 +123,7 @@ bool PPCLowerMASSVEntries::handlePowSpecialCases(CallInst *CI, Function &Func, return false; CI->setCalledFunction( - Intrinsic::getDeclaration(&M, Intrinsic::pow, CI->getType())); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::pow, CI->getType())); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 230ccd8209e1f2..1f9fc984515cf6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -20608,7 +20608,7 @@ Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic( Value *Ordering = Builder.getIntN(XLen, static_cast(AI->getOrdering())); Type *Tys[] = {AlignedAddr->getType()}; - Function *LrwOpScwLoop = Intrinsic::getDeclaration( + Function *LrwOpScwLoop = Intrinsic::getOrInsertDeclaration( AI->getModule(), getIntrinsicForMaskedAtomicRMWBinOp(XLen, AI->getOperation()), Tys); @@ -20672,7 +20672,7 @@ Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( } Type *Tys[] = {AlignedAddr->getType()}; Function *MaskedCmpXchg = - Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys); + Intrinsic::getOrInsertDeclaration(CI->getModule(), CmpXchgIntrID, Tys); Value *Result = Builder.CreateCall( MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering}); if (XLen == 64) @@ -21170,7 +21170,7 @@ bool RISCVTargetLowering::preferScalarizeSplat(SDNode *N) const { static Value *useTpOffset(IRBuilderBase &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getModule(); Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer); return IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), Offset); } 
@@ -21287,9 +21287,9 @@ bool RISCVTargetLowering::lowerInterleavedLoad( auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); - Function *VlsegNFunc = - Intrinsic::getDeclaration(LI->getModule(), FixedVlsegIntrIds[Factor - 2], - {VTy, LI->getPointerOperandType(), XLenTy}); + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( + LI->getModule(), FixedVlsegIntrIds[Factor - 2], + {VTy, LI->getPointerOperandType(), XLenTy}); Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); @@ -21341,9 +21341,9 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); - Function *VssegNFunc = - Intrinsic::getDeclaration(SI->getModule(), FixedVssegIntrIds[Factor - 2], - {VTy, SI->getPointerOperandType(), XLenTy}); + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + SI->getModule(), FixedVssegIntrIds[Factor - 2], + {VTy, SI->getPointerOperandType(), XLenTy}); auto Mask = SVI->getShuffleMask(); SmallVector Ops; @@ -21388,7 +21388,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); if (auto *FVTy = dyn_cast(ResVTy)) { - Function *VlsegNFunc = Intrinsic::getDeclaration( + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( LI->getModule(), FixedVlsegIntrIds[Factor - 2], {ResVTy, LI->getPointerOperandType(), XLenTy}); Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); @@ -21408,7 +21408,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( NumElts * SEW / 8), Factor); - Function *VlsegNFunc = Intrinsic::getDeclaration( + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( LI->getModule(), IntrIds[Factor - 2], {VecTupTy, XLenTy}); Value *VL = Constant::getAllOnesValue(XLenTy); @@ -21418,7 +21418,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( SmallVector AggrTypes{Factor, ResVTy}; Return = PoisonValue::get(StructType::get(LI->getContext(), 
AggrTypes)); - Function *VecExtractFunc = Intrinsic::getDeclaration( + Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( LI->getModule(), Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy}); for (unsigned i = 0; i < Factor; ++i) { Value *VecExtract = @@ -21454,7 +21454,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); if (auto *FVTy = dyn_cast(InVTy)) { - Function *VssegNFunc = Intrinsic::getDeclaration( + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, SI->getPointerOperandType(), XLenTy}); Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); @@ -21475,12 +21475,12 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( NumElts * SEW / 8), Factor); - Function *VssegNFunc = Intrinsic::getDeclaration( + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), IntrIds[Factor - 2], {VecTupTy, XLenTy}); Value *VL = Constant::getAllOnesValue(XLenTy); - Function *VecInsertFunc = Intrinsic::getDeclaration( + Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy}); Value *StoredVal = PoisonValue::get(VecTupTy); for (unsigned i = 0; i < Factor; ++i) diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 1872b238d1077a..ecf9b6ddae1fc3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -353,11 +353,11 @@ static void lowerExpectAssume(IntrinsicInst *II) { // We need to lower this into a builtin and then the builtin into a SPIR-V // instruction. 
if (II->getIntrinsicID() == Intrinsic::assume) { - Function *F = Intrinsic::getDeclaration( + Function *F = Intrinsic::getOrInsertDeclaration( II->getModule(), Intrinsic::SPVIntrinsics::spv_assume); II->setCalledFunction(F); } else if (II->getIntrinsicID() == Intrinsic::expect) { - Function *F = Intrinsic::getDeclaration( + Function *F = Intrinsic::getOrInsertDeclaration( II->getModule(), Intrinsic::SPVIntrinsics::spv_expect, {II->getOperand(0)->getType()}); II->setCalledFunction(F); @@ -372,12 +372,12 @@ static bool toSpvOverloadedIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID, ArrayRef OpNos) { Function *F = nullptr; if (OpNos.empty()) { - F = Intrinsic::getDeclaration(II->getModule(), NewID); + F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID); } else { SmallVector Tys; for (unsigned OpNo : OpNos) Tys.push_back(II->getOperand(OpNo)->getType()); - F = Intrinsic::getDeclaration(II->getModule(), NewID, Tys); + F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID, Tys); } II->setCalledFunction(F); return true; diff --git a/llvm/lib/Target/SystemZ/SystemZTDC.cpp b/llvm/lib/Target/SystemZ/SystemZTDC.cpp index f62afb8ddfcfae..345327e880ecd5 100644 --- a/llvm/lib/Target/SystemZ/SystemZTDC.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTDC.cpp @@ -366,8 +366,8 @@ bool SystemZTDCPass::runOnFunction(Function &F) { if (!Worthy) continue; // Call the intrinsic, compare result with 0. 
- Function *TDCFunc = - Intrinsic::getDeclaration(&M, Intrinsic::s390_tdc, V->getType()); + Function *TDCFunc = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::s390_tdc, V->getType()); IRBuilder<> IRB(I); Value *MaskVal = ConstantInt::get(Type::getInt64Ty(Ctx), Mask); Instruction *TDC = IRB.CreateCall(TDCFunc, {V, MaskVal}); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index c040e560be605f..b999f83507f4ce 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -1016,7 +1016,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { // wasm.catch() will be lowered down to wasm 'catch' instruction in // instruction selection. - CatchF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_catch); + CatchF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_catch); // Type for struct __WasmLongjmpArgs LongjmpArgsTy = StructType::get(Int8PtrTy, // env Int32Ty // val diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp index 2594430d1d8f3a..c61aa5eff4a708 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp @@ -72,7 +72,7 @@ bool WebAssemblyLowerRefTypesIntPtrConv::runOnFunction(Function &F) { I->replaceAllUsesWith(U); Function *TrapIntrin = - Intrinsic::getDeclaration(F.getParent(), Intrinsic::debugtrap); + Intrinsic::getOrInsertDeclaration(F.getParent(), Intrinsic::debugtrap); CallInst::Create(TrapIntrin, {}, "", I->getIterator()); worklist.insert(&*I); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7a6d20c6a121b6..de88db22279797 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ 
b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31163,12 +31163,14 @@ void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const { if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) { auto *C = cast(I->getOperand(I->getOperand(0) == AI ? 1 : 0)); - BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType()); + BitTest = Intrinsic::getOrInsertDeclaration(AI->getModule(), IID_C, + AI->getType()); unsigned Imm = llvm::countr_zero(C->getZExtValue()); Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)}); } else { - BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType()); + BitTest = Intrinsic::getOrInsertDeclaration(AI->getModule(), IID_I, + AI->getType()); assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit); @@ -31328,7 +31330,7 @@ void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic( break; } Function *CmpArith = - Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType()); + Intrinsic::getOrInsertDeclaration(AI->getModule(), IID, AI->getType()); Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(), PointerType::getUnqual(Ctx)); Value *Call = Builder.CreateCall( @@ -31444,7 +31446,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { return nullptr; Function *MFence = - llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); + llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse2_mfence); Builder.CreateCall(MFence, {}); // Finally we can emit the atomic load. 
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 77139f38c977bb..c4374984da4b9e 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -1876,7 +1876,8 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { Value *Args[] = {Op0, CILength, CIIndex}; Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + Function *F = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse4a_extrqi); return Builder.CreateCall(F, Args); } } @@ -1975,7 +1976,8 @@ static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, Value *Args[] = {Op0, Op1, CILength, CIIndex}; Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + Function *F = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse4a_insertqi); return Builder.CreateCall(F, Args); } diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index 5bbfabcbd67bc6..e88702caa9a52b 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -278,7 +278,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) { IntrinsicNumElts = 16; } - Function *PSADBWFn = Intrinsic::getDeclaration(Op->getModule(), IID); + Function *PSADBWFn = Intrinsic::getOrInsertDeclaration(Op->getModule(), IID); if (NumElts < 16) { // Pad input with zeroes. 
diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp index 963d613ddbfe7d..05fc6f13129f24 100644 --- a/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/llvm/lib/Target/X86/X86WinEHState.cpp @@ -334,7 +334,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { if (UseStackGuard) { Value *Val = Builder.CreateLoad(Int32Ty, Cookie); Value *FrameAddr = Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( TheModule, Intrinsic::frameaddress, Builder.getPtrTy( TheModule->getDataLayout().getAllocaAddrSpace())), @@ -370,7 +370,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) { return Builder.CreateCall( - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), F); + Intrinsic::getOrInsertDeclaration(TheModule, Intrinsic::x86_seh_lsda), F); } /// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls @@ -624,17 +624,17 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { // that it can recover the original frame pointer. IRBuilder<> Builder(RegNode->getNextNode()); Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getPtrTy()); - Builder.CreateCall( - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode), - {RegNodeI8}); + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + TheModule, Intrinsic::x86_seh_ehregnode), + {RegNodeI8}); if (EHGuardNode) { IRBuilder<> Builder(EHGuardNode->getNextNode()); Value *EHGuardNodeI8 = Builder.CreateBitCast(EHGuardNode, Builder.getPtrTy()); - Builder.CreateCall( - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehguard), - {EHGuardNodeI8}); + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + TheModule, Intrinsic::x86_seh_ehguard), + {EHGuardNodeI8}); } // Calculate state numbers. 
diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp index 95962d1a0a240f..3604774ddf35bf 100644 --- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -157,8 +157,8 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) { for (User *U : Users) { Instruction *Inst = cast(U); IRBuilder<> Builder(Inst); - Function *GetID = Intrinsic::getDeclaration(GV->getParent(), - Intrinsic::xcore_getid); + Function *GetID = Intrinsic::getOrInsertDeclaration(GV->getParent(), + Intrinsic::xcore_getid); Value *ThreadID = Builder.CreateCall(GetID, {}); Value *Addr = Builder.CreateInBoundsGEP(NewGV->getValueType(), NewGV, {Builder.getInt64(0), ThreadID}); diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 9943c3cbb9fc7d..898d55fab2b00d 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -172,7 +172,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { // %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ] // --> // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt) - Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(Phi.getModule(), IID, Phi.getType()); Phi.replaceAllUsesWith(Builder.CreateCall(F, {ShVal0, ShVal1, ShAmt})); return true; } @@ -331,7 +332,7 @@ static bool tryToRecognizePopCount(Instruction &I) { m_SpecificInt(Mask55)))) { LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n"); IRBuilder<> Builder(&I); - Function *Func = Intrinsic::getDeclaration( + Function *Func = Intrinsic::getOrInsertDeclaration( I.getModule(), Intrinsic::ctpop, I.getType()); I.replaceAllUsesWith(Builder.CreateCall(Func, {Root})); ++NumPopCountRecognized; 
@@ -398,8 +399,8 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) { return false; IRBuilder<> Builder(&I); - Function *Fn = Intrinsic::getDeclaration(I.getModule(), Intrinsic::fptosi_sat, - {SatTy, FpTy}); + Function *Fn = Intrinsic::getOrInsertDeclaration( + I.getModule(), Intrinsic::fptosi_sat, {SatTy, FpTy}); Value *Sat = Builder.CreateCall(Fn, In); I.replaceAllUsesWith(Builder.CreateSExt(Sat, IntTy)); return true; @@ -431,7 +432,7 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI, IRBuilderBase::FastMathFlagGuard Guard(Builder); Builder.setFastMathFlags(Call->getFastMathFlags()); - Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, Ty); + Function *Sqrt = Intrinsic::getOrInsertDeclaration(M, Intrinsic::sqrt, Ty); Value *NewSqrt = Builder.CreateCall(Sqrt, Arg, "sqrt"); Call->replaceAllUsesWith(NewSqrt); diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 1c45bcd7f6a837..45b9767657c66a 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -52,7 +52,8 @@ coro::LowererBase::LowererBase(Module &M) CallInst *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt) { auto *IndexVal = ConstantInt::get(Type::getInt8Ty(Context), Index); - auto *Fn = Intrinsic::getDeclaration(&TheModule, Intrinsic::coro_subfn_addr); + auto *Fn = + Intrinsic::getOrInsertDeclaration(&TheModule, Intrinsic::coro_subfn_addr); assert(Index >= CoroSubFnInst::IndexFirst && Index < CoroSubFnInst::IndexLast && @@ -183,7 +184,7 @@ void coro::suppressCoroAllocs(LLVMContext &Context, static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin, CoroSuspendInst *SuspendInst) { Module *M = SuspendInst->getModule(); - auto *Fn = Intrinsic::getDeclaration(M, Intrinsic::coro_save); + auto *Fn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::coro_save); auto *SaveInst = cast( CallInst::Create(Fn, 
CoroBegin, "", SuspendInst->getIterator())); assert(!SuspendInst->getCoroSave()); diff --git a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp index 91d445dfc4c734..9e5d9ea31af6c4 100644 --- a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -125,7 +125,8 @@ void CrossDSOCFI::buildCFICheck(Module &M) { ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId); BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F); IRBuilder<> IRBTest(TestBB); - Function *BitsetTestFn = Intrinsic::getDeclaration(&M, Intrinsic::type_test); + Function *BitsetTestFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::type_test); Value *Test = IRBTest.CreateCall( BitsetTestFn, {&Addr, MetadataAsValue::get( diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp index d84856f71c9de6..543987d5981bab 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp @@ -401,7 +401,7 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) { assert(Builder.GetInsertPoint() != BB->end() && "Cannot get the probing point"); Function *ProbeFn = - llvm::Intrinsic::getDeclaration(M, Intrinsic::pseudoprobe); + llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::pseudoprobe); Value *Args[] = {Builder.getInt64(Guid), Builder.getInt64(Index), Builder.getInt32(0), Builder.getInt64(PseudoProbeFullDistributionFactor)}; diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 36a1841b363463..59f986b4ca2664 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -856,7 +856,7 @@ void llvm::updatePublicTypeTestCalls(Module &M, return; if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) { Function *TypeTestFunc = - Intrinsic::getDeclaration(&M, 
Intrinsic::type_test); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::type_test); for (Use &U : make_early_inc_range(PublicTypeTestFunc->uses())) { auto *CI = cast(U.getUser()); auto *NewCI = CallInst::Create( @@ -1187,7 +1187,8 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, Instruction *ThenTerm = SplitBlockAndInsertIfThen(Cond, &CB, /*Unreachable=*/false); Builder.SetInsertPoint(ThenTerm); - Function *TrapFn = Intrinsic::getDeclaration(&M, Intrinsic::debugtrap); + Function *TrapFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::debugtrap); auto *CallTrap = Builder.CreateCall(TrapFn); CallTrap->setDebugLoc(CB.getDebugLoc()); } @@ -1434,8 +1435,8 @@ void DevirtModule::tryICallBranchFunnel( } BasicBlock *BB = BasicBlock::Create(M.getContext(), "", JT, nullptr); - Function *Intr = - Intrinsic::getDeclaration(&M, llvm::Intrinsic::icall_branch_funnel, {}); + Function *Intr = Intrinsic::getOrInsertDeclaration( + &M, llvm::Intrinsic::icall_branch_funnel, {}); auto *CI = CallInst::Create(Intr, JTArgs, "", BB); CI->setTailCallKind(CallInst::TCK_MustTail); @@ -2026,7 +2027,8 @@ void DevirtModule::scanTypeTestUsers( } void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { - Function *TypeTestFunc = Intrinsic::getDeclaration(&M, Intrinsic::type_test); + Function *TypeTestFunc = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::type_test); for (Use &U : llvm::make_early_inc_range(TypeCheckedLoadFunc->uses())) { auto *CI = dyn_cast(U.getUser()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index e5c3a20e1a6487..21588aca512758 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1232,7 +1232,8 @@ static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) { assert(I.getOpcode() == Instruction::Add && "Expecting add instruction"); Type *Ty = I.getType(); auto 
getUAddSat = [&]() { - return Intrinsic::getDeclaration(I.getModule(), Intrinsic::uadd_sat, Ty); + return Intrinsic::getOrInsertDeclaration(I.getModule(), Intrinsic::uadd_sat, + Ty); }; // add (umin X, ~Y), Y --> uaddsat X, Y @@ -2127,7 +2128,7 @@ static Instruction *foldSubOfMinMax(BinaryOperator &I, if (match(Op0, m_c_Add(m_Specific(X), m_Specific(Y))) && (Op0->hasOneUse() || Op1->hasOneUse())) { Intrinsic::ID InvID = getInverseMinMaxIntrinsic(MinMax->getIntrinsicID()); - Function *F = Intrinsic::getDeclaration(I.getModule(), InvID, Ty); + Function *F = Intrinsic::getOrInsertDeclaration(I.getModule(), InvID, Ty); return CallInst::Create(F, {X, Y}); } @@ -2150,7 +2151,7 @@ static Instruction *foldSubOfMinMax(BinaryOperator &I, if (MinMax->isSigned() && match(Y, m_ZeroInt()) && match(X, m_NSWSub(m_Specific(Op0), m_Value(Z)))) { Intrinsic::ID InvID = getInverseMinMaxIntrinsic(MinMax->getIntrinsicID()); - Function *F = Intrinsic::getDeclaration(I.getModule(), InvID, Ty); + Function *F = Intrinsic::getOrInsertDeclaration(I.getModule(), InvID, Ty); return CallInst::Create(F, {Op0, Z}); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 964616a4eb35e2..453071f3f982cd 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -2269,7 +2269,8 @@ foldBitwiseLogicWithIntrinsics(BinaryOperator &I, Builder.CreateBinOp(I.getOpcode(), X->getOperand(0), Y->getOperand(0)); Value *NewOp1 = Builder.CreateBinOp(I.getOpcode(), X->getOperand(1), Y->getOperand(1)); - Function *F = Intrinsic::getDeclaration(I.getModule(), IID, I.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(I.getModule(), IID, I.getType()); return CallInst::Create(F, {NewOp0, NewOp1, X->getOperand(2)}); } case Intrinsic::bswap: @@ -2280,7 +2281,8 @@ foldBitwiseLogicWithIntrinsics(BinaryOperator &I, : ConstantInt::get(I.getType(), IID == 
Intrinsic::bswap ? RHSC->byteSwap() : RHSC->reverseBits())); - Function *F = Intrinsic::getDeclaration(I.getModule(), IID, I.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(I.getModule(), IID, I.getType()); return CallInst::Create(F, {NewOp0}); } default: @@ -3056,7 +3058,8 @@ InstCombinerImpl::convertOrOfShiftsToFunnelShift(Instruction &Or) { static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) { if (auto Opt = IC.convertOrOfShiftsToFunnelShift(Or)) { auto [IID, FShiftArgs] = *Opt; - Function *F = Intrinsic::getDeclaration(Or.getModule(), IID, Or.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(Or.getModule(), IID, Or.getType()); return CallInst::Create(F, FShiftArgs); } @@ -3095,7 +3098,7 @@ static Instruction *matchOrConcat(Instruction &Or, Value *NewUpper = Builder.CreateZExt(Hi, Ty); NewUpper = Builder.CreateShl(NewUpper, HalfWidth); Value *BinOp = Builder.CreateOr(NewLower, NewUpper); - Function *F = Intrinsic::getDeclaration(Or.getModule(), id, Ty); + Function *F = Intrinsic::getOrInsertDeclaration(Or.getModule(), id, Ty); return Builder.CreateCall(F, BinOp); }; @@ -4803,7 +4806,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { match(II->getArgOperand(1), m_One()) && isKnownToBeAPowerOfTwo(II->getArgOperand(0), /*OrZero */ true)) { IID = (IID == Intrinsic::ctlz) ? 
Intrinsic::cttz : Intrinsic::ctlz; - Function *F = Intrinsic::getDeclaration(II->getModule(), IID, Ty); + Function *F = + Intrinsic::getOrInsertDeclaration(II->getModule(), IID, Ty); return CallInst::Create(F, {II->getArgOperand(0), Builder.getTrue()}); } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index f7a9406791801c..51e09b7e7c1437 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -488,7 +488,8 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { // cttz(bitreverse(x)) -> ctlz(x) if (match(Op0, m_BitReverse(m_Value(X)))) { Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz; - Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(II.getModule(), ID, II.getType()); return CallInst::Create(F, {X, II.getArgOperand(1)}); } @@ -647,7 +648,7 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) { if (Op0->hasOneUse() && match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) { Function *F = - Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty); + Intrinsic::getOrInsertDeclaration(II.getModule(), Intrinsic::cttz, Ty); auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()}); auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth)); return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz)); @@ -657,7 +658,7 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) { if (match(Op0, m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) { Function *F = - Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty); + Intrinsic::getOrInsertDeclaration(II.getModule(), Intrinsic::cttz, Ty); return CallInst::Create(F, {X, IC.Builder.getFalse()}); } @@ -1181,7 +1182,8 @@ Instruction *InstCombinerImpl::matchSAddSubSat(IntrinsicInst &MinMax1) { 
return nullptr; // Finally create and return the sat intrinsic, truncated to the new type - Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy); + Function *F = Intrinsic::getOrInsertDeclaration(MinMax1.getModule(), + IntrinsicID, NewTy); Value *AT = Builder.CreateTrunc(AddSub->getOperand(0), NewTy); Value *BT = Builder.CreateTrunc(AddSub->getOperand(1), NewTy); Value *Sat = Builder.CreateCall(F, {AT, BT}); @@ -1286,8 +1288,8 @@ reassociateMinMaxWithConstantInOperand(IntrinsicInst *II, return nullptr; // max (max X, C), Y --> max (max X, Y), C - Function *MinMax = - Intrinsic::getDeclaration(II->getModule(), MinMaxID, II->getType()); + Function *MinMax = Intrinsic::getOrInsertDeclaration(II->getModule(), + MinMaxID, II->getType()); Value *NewInner = Builder.CreateBinaryIntrinsic(MinMaxID, X, Y); NewInner->takeName(Inner); return CallInst::Create(MinMax, {NewInner, C}); @@ -1346,7 +1348,8 @@ static Instruction *factorizeMinMaxTree(IntrinsicInst *II) { return nullptr; Module *Mod = II->getModule(); - Function *MinMax = Intrinsic::getDeclaration(Mod, MinMaxID, II->getType()); + Function *MinMax = + Intrinsic::getOrInsertDeclaration(Mod, MinMaxID, II->getType()); return CallInst::Create(MinMax, { MinMaxOp, ThirdOp }); } @@ -1571,7 +1574,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Type *Tys[3] = { CI.getArgOperand(0)->getType(), CI.getArgOperand(1)->getType(), CI.getArgOperand(2)->getType() }; - CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys)); + CI.setCalledFunction( + Intrinsic::getOrInsertDeclaration(M, MemCpyID, Tys)); Changed = true; } } @@ -2095,7 +2099,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC); Module *Mod = II->getModule(); - Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty); + Function *Fshl = + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::fshl, Ty); return CallInst::Create(Fshl, { Op0, 
Op1, LeftShiftC }); } assert(IID == Intrinsic::fshl && @@ -2115,7 +2120,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form) if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) { Module *Mod = II->getModule(); - Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty); + Function *Bswap = + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::bswap, Ty); return CallInst::Create(Bswap, { Op0 }); } if (Instruction *BitOp = @@ -2824,7 +2830,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { CallArgs.push_back(II->getArgOperand(4)); } - Function *NewFn = Intrinsic::getDeclaration(II->getModule(), NewIntrin); + Function *NewFn = + Intrinsic::getOrInsertDeclaration(II->getModule(), NewIntrin); return CallInst::Create(NewFn, CallArgs); } case Intrinsic::arm_neon_vtbl1: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 9934c065ebf85f..6c2554ea73b7f8 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -95,8 +95,8 @@ Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty, default: llvm_unreachable("Unsupported call!"); case Intrinsic::vscale: { - Function *Fn = - Intrinsic::getDeclaration(I->getModule(), Intrinsic::vscale, {Ty}); + Function *Fn = Intrinsic::getOrInsertDeclaration( + I->getModule(), Intrinsic::vscale, {Ty}); Res = CallInst::Create(Fn->getFunctionType(), Fn); break; } @@ -600,7 +600,8 @@ Instruction *InstCombinerImpl::narrowFunnelShift(TruncInst &Trunc) { if (ShVal0 != ShVal1) Y = Builder.CreateTrunc(ShVal1, DestTy); Intrinsic::ID IID = IsFshl ? 
Intrinsic::fshl : Intrinsic::fshr; - Function *F = Intrinsic::getDeclaration(Trunc.getModule(), IID, DestTy); + Function *F = + Intrinsic::getOrInsertDeclaration(Trunc.getModule(), IID, DestTy); return CallInst::Create(F, {X, Y, NarrowShAmt}); } @@ -1912,8 +1913,8 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { // Do unary FP operation on smaller type. // (fptrunc (fabs x)) -> (fabs (fptrunc x)) Value *InnerTrunc = Builder.CreateFPTrunc(Src, Ty); - Function *Overload = Intrinsic::getDeclaration(FPT.getModule(), - II->getIntrinsicID(), Ty); + Function *Overload = Intrinsic::getOrInsertDeclaration( + FPT.getModule(), II->getIntrinsicID(), Ty); SmallVector OpBundles; II->getOperandBundlesAsDefs(OpBundles); CallInst *NewCI = @@ -2855,8 +2856,8 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { if (IntrinsicNum != 0) { assert(ShufOp0->getType() == SrcTy && "Unexpected shuffle mask"); assert(match(ShufOp1, m_Undef()) && "Unexpected shuffle op"); - Function *BswapOrBitreverse = - Intrinsic::getDeclaration(CI.getModule(), IntrinsicNum, DestTy); + Function *BswapOrBitreverse = Intrinsic::getOrInsertDeclaration( + CI.getModule(), IntrinsicNum, DestTy); Value *ScalarX = Builder.CreateBitCast(ShufOp0, DestTy); return CallInst::Create(BswapOrBitreverse, {ScalarX}); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index d1eb84b5ca5c10..7129499e0f8f9d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1125,7 +1125,7 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // use the sadd_with_overflow intrinsic to efficiently compute both the // result and the overflow bit. 
Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth); - Function *F = Intrinsic::getDeclaration( + Function *F = Intrinsic::getOrInsertDeclaration( I.getModule(), Intrinsic::sadd_with_overflow, NewType); InstCombiner::BuilderTy &Builder = IC.Builder; @@ -4790,11 +4790,11 @@ Value *InstCombinerImpl::foldMultiplicationOverflowCheck(ICmpInst &I) { if (MulHadOtherUses) Builder.SetInsertPoint(Mul); - Function *F = Intrinsic::getDeclaration(I.getModule(), - Div->getOpcode() == Instruction::UDiv - ? Intrinsic::umul_with_overflow - : Intrinsic::smul_with_overflow, - X->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + I.getModule(), + Div->getOpcode() == Instruction::UDiv ? Intrinsic::umul_with_overflow + : Intrinsic::smul_with_overflow, + X->getType()); CallInst *Call = Builder.CreateCall(F, {X, Y}, "mul"); // If the multiplication was used elsewhere, to ensure that we don't leave @@ -6334,7 +6334,7 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, MulA = Builder.CreateZExt(A, MulType); if (WidthB < MulWidth) MulB = Builder.CreateZExt(B, MulType); - Function *F = Intrinsic::getDeclaration( + Function *F = Intrinsic::getOrInsertDeclaration( I.getModule(), Intrinsic::umul_with_overflow, MulType); CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul"); IC.addToWorklist(MulInstr); @@ -7121,8 +7121,8 @@ static Instruction *foldVectorCmp(CmpInst &Cmp, if (auto *I = dyn_cast(V)) I->copyIRFlags(&Cmp); Module *M = Cmp.getModule(); - Function *F = - Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reverse, V->getType()); return CallInst::Create(F, V); }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 3f780285efe423..358563a5fcd537 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ 
b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1148,8 +1148,8 @@ static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal, if (!match(II->getOperand(0), m_c_And(m_Specific(X), m_Neg(m_Specific(X))))) return nullptr; - Function *F = Intrinsic::getDeclaration(II->getModule(), Intrinsic::cttz, - II->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + II->getModule(), Intrinsic::cttz, II->getType()); return CallInst::Create(F, {X, II->getArgOperand(1)}); } @@ -2242,8 +2242,8 @@ foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) { else return nullptr; - Function *F = - Intrinsic::getDeclaration(SI.getModule(), NewIntrinsicID, SI.getType()); + Function *F = Intrinsic::getOrInsertDeclaration(SI.getModule(), + NewIntrinsicID, SI.getType()); return CallInst::Create(F, {X, Y}); } @@ -2537,7 +2537,8 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel, // This is a funnel/rotate that avoids shift-by-bitwidth UB in a suboptimal way. // Convert to funnel shift intrinsic. Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr; - Function *F = Intrinsic::getDeclaration(Sel.getModule(), IID, Sel.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(Sel.getModule(), IID, Sel.getType()); ShAmt = Builder.CreateZExt(ShAmt, Sel.getType()); return CallInst::Create(F, { SV0, SV1, ShAmt }); } @@ -2580,8 +2581,8 @@ static Instruction *foldSelectToCopysign(SelectInst &Sel, // Canonicalize the magnitude argument as the positive constant since we do // not care about its sign. 
Value *MagArg = ConstantFP::get(SelType, abs(*TC)); - Function *F = Intrinsic::getDeclaration(Sel.getModule(), Intrinsic::copysign, - Sel.getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + Sel.getModule(), Intrinsic::copysign, Sel.getType()); return CallInst::Create(F, { MagArg, X }); } @@ -2600,8 +2601,8 @@ Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) { if (auto *I = dyn_cast(V)) I->copyIRFlags(&Sel); Module *M = Sel.getModule(); - Function *F = - Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reverse, V->getType()); return CallInst::Create(F, V); }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 3d4461dc1a87f6..8ca705ae1d364d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -898,7 +898,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I, Value *X; if (DemandedMask == 1 && VTy->getScalarSizeInBits() % 2 == 0 && match(II->getArgOperand(0), m_Not(m_Value(X)))) { - Function *Ctpop = Intrinsic::getDeclaration( + Function *Ctpop = Intrinsic::getOrInsertDeclaration( II->getModule(), Intrinsic::ctpop, VTy); return InsertNewInstWith(CallInst::Create(Ctpop, {X}), I->getIterator()); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index d9b4faff4c004d..d68ae64f08aa90 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -2474,8 +2474,8 @@ static Instruction *foldShuffleOfUnaryOps(ShuffleVectorInst &Shuf, if (IsFNeg) return UnaryOperator::CreateFNegFMF(NewShuf, S0); - Function *FAbs = Intrinsic::getDeclaration(Shuf.getModule(), - Intrinsic::fabs, Shuf.getType()); + 
Function *FAbs = Intrinsic::getOrInsertDeclaration( + Shuf.getModule(), Intrinsic::fabs, Shuf.getType()); CallInst *NewF = CallInst::Create(FAbs, {NewShuf}); NewF->setFastMathFlags(S0->getFastMathFlags()); return NewF; @@ -2495,8 +2495,8 @@ static Instruction *foldShuffleOfUnaryOps(ShuffleVectorInst &Shuf, if (IsFNeg) { NewF = UnaryOperator::CreateFNeg(NewShuf); } else { - Function *FAbs = Intrinsic::getDeclaration(Shuf.getModule(), - Intrinsic::fabs, Shuf.getType()); + Function *FAbs = Intrinsic::getOrInsertDeclaration( + Shuf.getModule(), Intrinsic::fabs, Shuf.getType()); NewF = CallInst::Create(FAbs, {NewShuf}); } NewF->copyIRFlags(S0); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 1f4a6f793404cf..954c4cf19c2077 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2084,8 +2084,8 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) { if (auto *BO = dyn_cast(V)) BO->copyIRFlags(&Inst); Module *M = Inst.getModule(); - Function *F = - Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reverse, V->getType()); return CallInst::Create(F, V); }; @@ -3355,7 +3355,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { if (InvokeInst *II = dyn_cast(&MI)) { // Replace invoke with a NOP intrinsic to maintain the original CFG Module *M = II->getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); + Function *F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::donothing); InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), {}, "", II->getParent()); } diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 2ad89b5ba753a5..02d9fab309d83b 100644 --- 
a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1109,7 +1109,7 @@ struct FunctionStackPoisoner : public InstVisitor { // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for // this purpose. if (!isa(InstBefore)) { - Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration( + Function *DynamicAreaOffsetFunc = Intrinsic::getOrInsertDeclaration( InstBefore->getModule(), Intrinsic::get_dynamic_area_offset, {IntptrTy}); @@ -1867,7 +1867,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, const ASanAccessInfo AccessInfo(IsWrite, CompileKernel, AccessSizeIndex); Module *M = IRB.GetInsertBlock()->getParent()->getParent(); IRB.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::asan_check_memaccess), + Intrinsic::getOrInsertDeclaration(M, Intrinsic::asan_check_memaccess), {IRB.CreatePointerCast(Addr, PtrTy), ConstantInt::get(Int32Ty, AccessInfo.Packed)}); return; diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 618b6fe1aea474..63d580d2b9d512 100644 --- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -194,7 +194,7 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, IRB.SetInsertPoint(TrapBB); Intrinsic::ID IntrID = DebugTrapBB ? 
Intrinsic::ubsantrap : Intrinsic::trap; - auto *F = Intrinsic::getDeclaration(Fn->getParent(), IntrID); + auto *F = Intrinsic::getOrInsertDeclaration(Fn->getParent(), IntrID); CallInst *TrapCall; if (DebugTrapBB) { diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index cc7f20cffea771..5ec4973ea03d8f 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1042,14 +1042,14 @@ void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite, if (UseFixedShadowIntrinsic) { IRB.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( M, UseShortGranules ? Intrinsic::hwasan_check_memaccess_shortgranules_fixedshadow : Intrinsic::hwasan_check_memaccess_fixedshadow), {Ptr, ConstantInt::get(Int32Ty, AccessInfo), ConstantInt::get(Int64Ty, Mapping.offset())}); } else { - IRB.CreateCall(Intrinsic::getDeclaration( + IRB.CreateCall(Intrinsic::getOrInsertDeclaration( M, UseShortGranules ? 
Intrinsic::hwasan_check_memaccess_shortgranules : Intrinsic::hwasan_check_memaccess), diff --git a/llvm/lib/Transforms/Instrumentation/KCFI.cpp b/llvm/lib/Transforms/Instrumentation/KCFI.cpp index 28dc1c02b661ac..bbe0f4c6178192 100644 --- a/llvm/lib/Transforms/Instrumentation/KCFI.cpp +++ b/llvm/lib/Transforms/Instrumentation/KCFI.cpp @@ -110,7 +110,8 @@ PreservedAnalyses KCFIPass::run(Function &F, FunctionAnalysisManager &AM) { Instruction *ThenTerm = SplitBlockAndInsertIfThen(Test, Call, false, VeryUnlikelyWeights); Builder.SetInsertPoint(ThenTerm); - Builder.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::debugtrap)); + Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::debugtrap)); ++NumKCFIChecks; } diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 07d667434e0710..19ec97c17f31c6 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2853,7 +2853,7 @@ struct MemorySanitizerVisitor : public InstVisitor { Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)), S2->getType()); Value *V2 = I.getOperand(2); - Function *Intrin = Intrinsic::getDeclaration( + Function *Intrin = Intrinsic::getOrInsertDeclaration( I.getModule(), I.getIntrinsicID(), S2Conv->getType()); Value *Shift = IRB.CreateCall(Intrin, {S0, S1, V2}); setShadow(&I, IRB.CreateOr(Shift, S2Conv)); @@ -3057,7 +3057,7 @@ struct MemorySanitizerVisitor : public InstVisitor { IRBuilder<> IRB(&I); Value *Op = I.getArgOperand(0); Type *OpType = Op->getType(); - Function *BswapFunc = Intrinsic::getDeclaration( + Function *BswapFunc = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::bswap, ArrayRef(&OpType, 1)); setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op))); setOrigin(&I, getOrigin(Op)); @@ -3287,7 +3287,7 @@ struct MemorySanitizerVisitor : public InstVisitor { S2_ext = 
IRB.CreateBitCast(S2_ext, getMMXVectorTy(64)); } - Function *ShadowFn = Intrinsic::getDeclaration( + Function *ShadowFn = Intrinsic::getOrInsertDeclaration( F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID())); Value *S = diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 10442fa0bb9003..e6e474ed376069 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -919,7 +919,7 @@ void FunctionInstrumenter::instrument() { // llvm.instrprof.cover(i8* , i64 , i32 , // i32 ) Builder.CreateCall( - Intrinsic::getDeclaration(&M, Intrinsic::instrprof_cover), + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_cover), {NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)}); return; } @@ -931,7 +931,7 @@ void FunctionInstrumenter::instrument() { if (IsCtxProf) { auto *CSIntrinsic = - Intrinsic::getDeclaration(&M, Intrinsic::instrprof_callsite); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_callsite); // We want to count the instrumentable callsites, then instrument them. This // is because the llvm.instrprof.callsite intrinsic has an argument (like // the other instrprof intrinsics) capturing the total number of @@ -972,7 +972,7 @@ void FunctionInstrumenter::instrument() { // llvm.instrprof.timestamp(i8* , i64 , i32 , // i32 ) Builder.CreateCall( - Intrinsic::getDeclaration(&M, Intrinsic::instrprof_timestamp), + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_timestamp), {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I)}); I += PGOBlockCoverage ? 8 : 1; @@ -984,12 +984,12 @@ void FunctionInstrumenter::instrument() { "Cannot get the Instrumentation point"); // llvm.instrprof.increment(i8* , i64 , i32 , // i32 ) - Builder.CreateCall( - Intrinsic::getDeclaration(&M, PGOBlockCoverage - ? 
Intrinsic::instrprof_cover - : Intrinsic::instrprof_increment), - {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), - Builder.getInt32(I++)}); + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + &M, PGOBlockCoverage + ? Intrinsic::instrprof_cover + : Intrinsic::instrprof_increment), + {NormalizedNamePtr, CFGHash, + Builder.getInt32(NumCounters), Builder.getInt32(I++)}); } // Now instrument select instructions: @@ -1038,7 +1038,8 @@ void FunctionInstrumenter::instrument() { SmallVector OpBundles; populateEHOperandBundle(Cand, BlockColors, OpBundles); Builder.CreateCall( - Intrinsic::getDeclaration(&M, Intrinsic::instrprof_value_profile), + Intrinsic::getOrInsertDeclaration(&M, + Intrinsic::instrprof_value_profile), {NormalizedNamePtr, Builder.getInt64(FuncInfo.FunctionHash), ToProfile, Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}, OpBundles); @@ -1726,7 +1727,7 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { ConstantExpr::getPointerBitCastOrAddrSpaceCast( FuncNameVar, PointerType::get(M->getContext(), 0)); Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step), + Intrinsic::getOrInsertDeclaration(M, Intrinsic::instrprof_increment_step), {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step}); ++(*CurCtrIdx); diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index db4bf709c9cc9c..719806fdf37f58 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -999,7 +999,7 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, if (Options.StackDepth && IsEntryBB && !IsLeafFunc) { // Check stack depth. If it's the deepest so far, record it. 
Module *M = F.getParent(); - Function *GetFrameAddr = Intrinsic::getDeclaration( + Function *GetFrameAddr = Intrinsic::getOrInsertDeclaration( M, Intrinsic::frameaddress, IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace())); auto FrameAddrPtr = diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 68cf4e55301314..388addfab181a4 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -571,9 +571,10 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, // Instrument function entry/exit points if there were instrumented accesses. if ((Res || HasCalls) && ClInstrumentFuncEntryExit) { InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); - Value *ReturnAddress = IRB.CreateCall( - Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress), - IRB.getInt32(0)); + Value *ReturnAddress = + IRB.CreateCall(Intrinsic::getOrInsertDeclaration( + F.getParent(), Intrinsic::returnaddress), + IRB.getInt32(0)); IRB.CreateCall(TsanFuncEntry, ReturnAddress); EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions); diff --git a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h index c11691c613ac78..0dedd0207571bf 100644 --- a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h +++ b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h @@ -139,7 +139,7 @@ class ARCRuntimeEntryPoints { if (Decl) return Decl; - return Decl = Intrinsic::getDeclaration(TheModule, IntID); + return Decl = Intrinsic::getOrInsertDeclaration(TheModule, IntID); } }; diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 60fd2a286119b3..9317e0643079ea 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -380,7 +380,8 @@ bool 
InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, case Intrinsic::masked_load: { Type *DestTy = II->getType(); Type *SrcTy = NewV->getType(); - Function *NewDecl = Intrinsic::getDeclaration(M, IID, {DestTy, SrcTy}); + Function *NewDecl = + Intrinsic::getOrInsertDeclaration(M, IID, {DestTy, SrcTy}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return true; @@ -391,7 +392,8 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, case Intrinsic::masked_gather: { Type *RetTy = II->getType(); Type *NewPtrTy = NewV->getType(); - Function *NewDecl = Intrinsic::getDeclaration(M, IID, {RetTy, NewPtrTy}); + Function *NewDecl = + Intrinsic::getOrInsertDeclaration(M, IID, {RetTy, NewPtrTy}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return true; @@ -400,16 +402,16 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, case Intrinsic::masked_scatter: { Type *ValueTy = II->getOperand(0)->getType(); Type *NewPtrTy = NewV->getType(); - Function *NewDecl = - Intrinsic::getDeclaration(M, II->getIntrinsicID(), {ValueTy, NewPtrTy}); + Function *NewDecl = Intrinsic::getOrInsertDeclaration( + M, II->getIntrinsicID(), {ValueTy, NewPtrTy}); II->setArgOperand(1, NewV); II->setCalledFunction(NewDecl); return true; } case Intrinsic::prefetch: case Intrinsic::is_constant: { - Function *NewDecl = - Intrinsic::getDeclaration(M, II->getIntrinsicID(), {NewV->getType()}); + Function *NewDecl = Intrinsic::getOrInsertDeclaration( + M, II->getIntrinsicID(), {NewV->getType()}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return true; diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index d85166e518f1db..4043c0e9a7ddc4 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -405,7 +405,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { IRBuilder<> Builder(P.InsertPt); Module 
*M = BB->getParent()->getParent(); Type *I32 = Type::getInt32Ty(BB->getContext()); - Function *PrefetchFunc = Intrinsic::getDeclaration( + Function *PrefetchFunc = Intrinsic::getOrInsertDeclaration( M, Intrinsic::prefetch, PrefPtrValue->getType()); Builder.CreateCall( PrefetchFunc, diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index d5e91d3c1decf8..30369ed7c245cf 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -978,8 +978,8 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, assert(match(Br->getCondition(), m_Zero()) && "Expected branch condition to be false"); IRBuilder<> Builder(Br); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow, - FI.OuterTripCount->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::umul_with_overflow, FI.OuterTripCount->getType()); Value *Call = Builder.CreateCall(F, {FI.OuterTripCount, FI.InnerTripCount}, "flatten.mul"); FI.NewTripCount = Builder.CreateExtractValue(Call, 0, "flatten.tripcount"); diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 740e1e39b9ee77..56006d9ae6924a 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -2122,7 +2122,7 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, Type *Tys[] = {Val->getType()}; Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); - Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); + Function *Func = Intrinsic::getOrInsertDeclaration(M, Intrinsic::ctpop, Tys); CallInst *CI = IRBuilder.CreateCall(Func, Ops); CI->setDebugLoc(DL); @@ -2136,7 +2136,7 @@ static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, Type *Tys[] = {Val->getType()}; Module *M = 
IRBuilder.GetInsertBlock()->getParent()->getParent(); - Function *Func = Intrinsic::getDeclaration(M, IID, Tys); + Function *Func = Intrinsic::getOrInsertDeclaration(M, IID, Tys); CallInst *CI = IRBuilder.CreateCall(Func, Ops); CI->setDebugLoc(DL); diff --git a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp index a59ecdda1746f9..ce35349376c483 100644 --- a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp @@ -44,7 +44,7 @@ static bool lowerGuardIntrinsic(Function &F) { if (ToLower.empty()) return false; - auto *DeoptIntrinsic = Intrinsic::getDeclaration( + auto *DeoptIntrinsic = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()}); DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv()); diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 0d98e844cf91ea..a4ab288b1bfee8 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -1290,7 +1290,7 @@ class LowerMatrixIntrinsics { if (AllowContraction) { // Use fmuladd for floating point operations and let the backend decide // if that's profitable. 
- Function *FMulAdd = Intrinsic::getDeclaration( + Function *FMulAdd = Intrinsic::getOrInsertDeclaration( Func.getParent(), Intrinsic::fmuladd, A->getType()); return Builder.CreateCall(FMulAdd, {A, B, Sum}); } diff --git a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp index aea17aa82a88a4..b9f88ba4e0780e 100644 --- a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp +++ b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -69,7 +69,7 @@ static bool explicifyGuards(Function &F) { if (GuardIntrinsics.empty()) return false; - auto *DeoptIntrinsic = Intrinsic::getDeclaration( + auto *DeoptIntrinsic = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()}); DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv()); diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index b568811dcdbcac..557a75e8946dc3 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1855,8 +1855,8 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) { // If not, then we know we can transform this. Type *ArgTys[3] = {M->getRawDest()->getType(), M->getRawSource()->getType(), M->getLength()->getType()}; - M->setCalledFunction( - Intrinsic::getDeclaration(M->getModule(), Intrinsic::memcpy, ArgTys)); + M->setCalledFunction(Intrinsic::getOrInsertDeclaration( + M->getModule(), Intrinsic::memcpy, ArgTys)); // For MemorySSA nothing really changes (except that memcpy may imply stricter // aliasing guarantees). 
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index e3c12c971b9ab0..daf8fa28a71e59 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1525,8 +1525,8 @@ static void CreateGCRelocates(ArrayRef LiveVariables, if (auto *VT = dyn_cast(Ty)) NewTy = FixedVectorType::get(NewTy, cast(VT)->getNumElements()); - return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, - {NewTy}); + return Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_gc_relocate, {NewTy}); }; // Lazily populated map from input types to the canonicalized form mentioned diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 72728c0f839e5d..b1e4c7e52d99a0 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -733,7 +733,8 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { ValueVector Res(VS->NumFragments); ValueVector ScalarCallOps(NumArgs); - Function *NewIntrin = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + Function *NewIntrin = + Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); IRBuilder<> Builder(&CI); // Perform actual scalarization, taking care to preserve any scalar operands. 
@@ -756,7 +757,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { } if (IsRemainder) - NewIntrin = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + NewIntrin = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); Res[I] = Builder.CreateCall(NewIntrin, ScalarCallOps, CI.getName() + ".i" + Twine(I)); diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp index 3cf68e07da5be2..e1dd20478fd55f 100644 --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -225,7 +225,8 @@ struct AssumeBuilderState { return nullptr; if (!DebugCounter::shouldExecute(BuildAssumeCounter)) return nullptr; - Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume); + Function *FnAssume = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::assume); LLVMContext &C = M->getContext(); SmallVector OpBundle; for (auto &MapElem : AssumedKnowledgeMap) { diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index fc03643e3542cc..c6ba85bd9e57d4 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -425,8 +425,8 @@ PruningFunctionCloner::cloneInstruction(BasicBlock::const_iterator II) { // Create intrinsic call. 
LLVMContext &Ctx = NewFunc->getContext(); - Function *IFn = - Intrinsic::getDeclaration(NewFunc->getParent(), CIID, TParams); + Function *IFn = Intrinsic::getOrInsertDeclaration(NewFunc->getParent(), + CIID, TParams); SmallVector Args; unsigned NumOperands = OldInst.getNumOperands(); if (isa(OldInst)) diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index f58448dd9562d5..a090c5ed749205 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1124,7 +1124,8 @@ static void insertLifetimeMarkersSurroundingCall( TheCall->getFunction()) && "Input memory not defined in original function"); - Function *Func = Intrinsic::getDeclaration(M, MarkerFunc, Mem->getType()); + Function *Func = + Intrinsic::getOrInsertDeclaration(M, MarkerFunc, Mem->getType()); auto Marker = CallInst::Create(Func, {NegativeOne, Mem}); if (InsertBefore) Marker->insertBefore(TheCall); diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp index d12c540f9a4d04..47bb31905d1ac8 100644 --- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -63,7 +63,7 @@ static void insertCall(Function &CurFn, StringRef Func, Func, FunctionType::get(Type::getVoidTy(C), ArgTypes, false)); Instruction *RetAddr = CallInst::Create( - Intrinsic::getDeclaration(&M, Intrinsic::returnaddress), + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress), ArrayRef(ConstantInt::get(Type::getInt32Ty(C), 0)), "", InsertionPt); RetAddr->setDebugLoc(DL); diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 671b0d0822a5d9..110fd6de5c6968 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -2090,7 +2090,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, if 
(IsUnsafeClaimRV) { Builder.SetInsertPoint(II); Function *IFn = - Intrinsic::getDeclaration(Mod, Intrinsic::objc_release); + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::objc_release); Builder.CreateCall(IFn, RetOpnd, ""); } II->eraseFromParent(); @@ -2125,7 +2125,8 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // matching autoreleaseRV or an annotated call in the callee. Emit a call // to objc_retain. Builder.SetInsertPoint(RI); - Function *IFn = Intrinsic::getDeclaration(Mod, Intrinsic::objc_retain); + Function *IFn = + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::objc_retain); Builder.CreateCall(IFn, RetOpnd, ""); } } @@ -3021,7 +3022,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, }); } else { SmallVector NormalReturns; - Function *NewDeoptIntrinsic = Intrinsic::getDeclaration( + Function *NewDeoptIntrinsic = Intrinsic::getOrInsertDeclaration( Caller->getParent(), Intrinsic::experimental_deoptimize, {Caller->getReturnType()}); diff --git a/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/llvm/lib/Transforms/Utils/IntegerDivision.cpp index 11956816a6ec3f..e95a7a9ae525ac 100644 --- a/llvm/lib/Transforms/Utils/IntegerDivision.cpp +++ b/llvm/lib/Transforms/Utils/IntegerDivision.cpp @@ -157,8 +157,8 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, BasicBlock *IBB = Builder.GetInsertBlock(); Function *F = IBB->getParent(); - Function *CTLZ = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, - DivTy); + Function *CTLZ = + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz, DivTy); // Our CFG is going to look like: // +---------------------+ diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index f3b8623ebb0f8f..06813bac7c781f 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -4141,7 +4141,8 @@ bool llvm::recognizeBSwapOrBitReverseIdiom( else return false; - 
Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, DemandedTy); + Function *F = + Intrinsic::getOrInsertDeclaration(I->getModule(), Intrin, DemandedTy); Value *Provider = Res->Provider; // We may need to truncate the provider. diff --git a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp index 55f9400d93d79b..cd79600657032e 100644 --- a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp +++ b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp @@ -215,8 +215,8 @@ static bool runImpl(Module &M) { // If `__cxa_atexit` hits out-of-memory, trap, so that we don't misbehave. // This should be very rare, because if the process is running out of // memory before main has even started, something is wrong. - CallInst::Create(Intrinsic::getDeclaration(&M, Intrinsic::trap), "", - FailBB); + CallInst::Create(Intrinsic::getOrInsertDeclaration(&M, Intrinsic::trap), + "", FailBB); new UnreachableInst(C, FailBB); ReturnInst::Create(C, RetBB); diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 1cb1a7b396badc..77abf160dc70f9 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -268,7 +268,7 @@ bool isLifetimeIntrinsic(Value *V) { Value *readRegister(IRBuilder<> &IRB, StringRef Name) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - Function *ReadRegister = Intrinsic::getDeclaration( + Function *ReadRegister = Intrinsic::getOrInsertDeclaration( M, Intrinsic::read_register, IRB.getIntPtrTy(M->getDataLayout())); MDNode *MD = MDNode::get(M->getContext(), {MDString::get(M->getContext(), Name)}); @@ -287,7 +287,7 @@ Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB) { Value *getFP(IRBuilder<> &IRB) { Function *F = IRB.GetInsertBlock()->getParent(); Module *M = F->getParent(); - auto *GetStackPointerFn = Intrinsic::getDeclaration( + auto *GetStackPointerFn = 
Intrinsic::getOrInsertDeclaration( M, Intrinsic::frameaddress, IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace())); return IRB.CreatePtrToInt( @@ -301,7 +301,7 @@ Value *getAndroidSlotPtr(IRBuilder<> &IRB, int Slot) { // Android provides a fixed TLS slot for sanitizers. See TLS_SLOT_SANITIZER // in Bionic's libc/private/bionic_tls.h. Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer); return IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), 8 * Slot); } diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index 186e17e166ba3d..2415118cad6fb3 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -559,7 +559,7 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, if (isa(ValInfo)) { IRBuilder<> B(getBranchTerminator(ValInfo)); auto NumDecls = F.getParent()->getNumNamedValues(); - Function *IF = Intrinsic::getDeclaration( + Function *IF = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::ssa_copy, Op->getType()); if (NumDecls != F.getParent()->getNumNamedValues()) PI.CreatedDeclarations.insert(IF); @@ -575,7 +575,7 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, // directly before it, assume(i1 true) is not a useful fact. 
IRBuilder<> B(PAssume->AssumeInst->getNextNode()); auto NumDecls = F.getParent()->getNumNamedValues(); - Function *IF = Intrinsic::getDeclaration( + Function *IF = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::ssa_copy, Op->getType()); if (NumDecls != F.getParent()->getNumNamedValues()) PI.CreatedDeclarations.insert(IF); diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 1b7912fdf5e304..656bb1ebd1161e 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -444,7 +444,7 @@ struct PromoteMem2Reg { /// Given a LoadInst LI this adds assume(LI != null) after it. static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) { Function *AssumeIntrinsic = - Intrinsic::getDeclaration(LI->getModule(), Intrinsic::assume); + Intrinsic::getOrInsertDeclaration(LI->getModule(), Intrinsic::assume); ICmpInst *LoadNotNull = new ICmpInst(ICmpInst::ICMP_NE, LI, Constant::getNullValue(LI->getType())); LoadNotNull->insertAfter(LI); diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp index 6e84965370b248..2700b4307308cb 100644 --- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp +++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp @@ -151,7 +151,7 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) { // GEP might not be immediately followed by a LOAD, like it can be hoisted // outside the loop or another instruction might be inserted them in between. 
Builder.SetInsertPoint(Load); - Function *LoadRelIntrinsic = llvm::Intrinsic::getDeclaration( + Function *LoadRelIntrinsic = llvm::Intrinsic::getOrInsertDeclaration( &M, Intrinsic::load_relative, {Index->getType()}); // Create a call to load.relative intrinsic that computes the target address diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 1ff3cd78aa9877..de1864ef5b8d9b 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -2134,8 +2134,8 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, MulV = TruncTripCount; OfMul = ConstantInt::getFalse(MulV->getContext()); } else { - auto *MulF = Intrinsic::getDeclaration(Loc->getModule(), - Intrinsic::umul_with_overflow, Ty); + auto *MulF = Intrinsic::getOrInsertDeclaration( + Loc->getModule(), Intrinsic::umul_with_overflow, Ty); CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul"); MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index e06ebb691d511c..db2acb9eed0938 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1960,7 +1960,7 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, if (IsIntrinsic) { Module *M = CI->getModule(); Intrinsic::ID IID = CalleeFn->getIntrinsicID(); - Function *Fn = Intrinsic::getDeclaration(M, IID, B.getFloatTy()); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, IID, B.getFloatTy()); R = isBinary ? 
B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]); } else { AttributeList CalleeAttrs = CalleeFn->getAttributes(); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e2958c49b8ca9f..5c164075e83259 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15079,7 +15079,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { - CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl); + CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl); } SmallVector OpBundles; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ba94cd29587664..2948ecc580edc0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -984,7 +984,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { // Use vector version of the intrinsic. 
Module *M = State.Builder.GetInsertBlock()->getModule(); Function *VectorF = - Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); + Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); auto *CI = cast_or_null(getUnderlyingValue()); diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp index fb8729c36a6f2d..0e2a6decfbc9d5 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp @@ -30,7 +30,7 @@ static bool shouldIgnoreArgument(const Value *V) { static Value *replaceIntrinsic(Module &M, IntrinsicInst *II, Intrinsic::ID NewIID, ArrayRef Tys = {}) { - Function *NewFunc = Intrinsic::getDeclaration(&M, NewIID, Tys); + Function *NewFunc = Intrinsic::getOrInsertDeclaration(&M, NewIID, Tys); II->setCalledFunction(NewFunc); return II; } diff --git a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp index 6437e0c9491f7f..8ad15ca41510f2 100644 --- a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp +++ b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp @@ -430,7 +430,8 @@ static void RunRandTest(uint64_t Seed, int Size, int MinCount, int MaxCount, BB->insertInto(F); Instruction *Ret = ReturnInst::Create(C); Ret->insertInto(BB, BB->begin()); - Function *FnAssume = Intrinsic::getDeclaration(Mod.get(), Intrinsic::assume); + Function *FnAssume = + Intrinsic::getOrInsertDeclaration(Mod.get(), Intrinsic::assume); std::vector ShuffledArgs; BitVector HasArg; diff --git a/llvm/unittests/Analysis/MemorySSATest.cpp b/llvm/unittests/Analysis/MemorySSATest.cpp index 9e6c517242a269..81784bb2360975 100644 --- a/llvm/unittests/Analysis/MemorySSATest.cpp +++ b/llvm/unittests/Analysis/MemorySSATest.cpp @@ -1120,7 +1120,7 @@ TEST_F(MemorySSATest, LifetimeMarkersAreClobbers) { B.CreateStore(B.getInt8(0), Bar); auto GetLifetimeIntrinsic = 
[&](Intrinsic::ID ID) { - return Intrinsic::getDeclaration(&M, ID, {Foo->getType()}); + return Intrinsic::getOrInsertDeclaration(&M, ID, {Foo->getType()}); }; B.CreateCall(GetLifetimeIntrinsic(Intrinsic::lifetime_end), diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp index 77d966155dceff..0145ee70a14c17 100644 --- a/llvm/unittests/Analysis/ValueTrackingTest.cpp +++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp @@ -2481,8 +2481,8 @@ TEST_F(ComputeKnownBitsTest, ComputeKnownBitsAddWithRange) { TEST_F(ComputeKnownBitsTest, ComputeKnownBitsUnknownVScale) { Module M("", Context); IRBuilder<> Builder(Context); - Function *TheFn = - Intrinsic::getDeclaration(&M, Intrinsic::vscale, {Builder.getInt32Ty()}); + Function *TheFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::vscale, + {Builder.getInt32Ty()}); CallInst *CI = Builder.CreateCall(TheFn, {}, {}, ""); KnownBits Known = computeKnownBits(CI, M.getDataLayout(), /* Depth */ 0); diff --git a/llvm/unittests/IR/BasicBlockTest.cpp b/llvm/unittests/IR/BasicBlockTest.cpp index eea2746a352aa6..88ac6611742ce9 100644 --- a/llvm/unittests/IR/BasicBlockTest.cpp +++ b/llvm/unittests/IR/BasicBlockTest.cpp @@ -109,8 +109,10 @@ TEST(BasicBlockTest, TestInstructionsWithoutDebug) { Argument *V = new Argument(Type::getInt32Ty(Ctx)); Function *F = Function::Create(FT, Function::ExternalLinkage, "", M); - Function *DbgDeclare = Intrinsic::getDeclaration(M, Intrinsic::dbg_declare); - Function *DbgValue = Intrinsic::getDeclaration(M, Intrinsic::dbg_value); + Function *DbgDeclare = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_declare); + Function *DbgValue = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_value); Value *DIV = MetadataAsValue::get(Ctx, (Metadata *)nullptr); SmallVector Args = {DIV, DIV, DIV}; @@ -174,7 +176,7 @@ class InstrOrderInvalidationTest : public ::testing::Test { protected: void SetUp() override { M.reset(new Module("MyModule", Ctx)); - Nop = 
Intrinsic::getDeclaration(M.get(), Intrinsic::donothing); + Nop = Intrinsic::getOrInsertDeclaration(M.get(), Intrinsic::donothing); FunctionType *FT = FunctionType::get(Type::getVoidTy(Ctx), {}, false); Function *F = Function::Create(FT, Function::ExternalLinkage, "foo", *M); BB = BasicBlock::Create(Ctx, "entry", F); diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp index 953df224e84dcb..ea20c87d6b09b4 100644 --- a/llvm/unittests/IR/DebugInfoTest.cpp +++ b/llvm/unittests/IR/DebugInfoTest.cpp @@ -693,7 +693,8 @@ TEST(IRBuilder, GetSetInsertionPointWithEmptyBasicBlock) { std::unique_ptr BB(BasicBlock::Create(C, "start")); Module *M = new Module("module", C); IRBuilder<> Builder(BB.get()); - Function *DbgDeclare = Intrinsic::getDeclaration(M, Intrinsic::dbg_declare); + Function *DbgDeclare = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_declare); Value *DIV = MetadataAsValue::get(C, (Metadata *)nullptr); SmallVector Args = {DIV, DIV, DIV}; Builder.CreateCall(DbgDeclare, Args); diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp index d5239f21147cdb..690af62d18020d 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -413,8 +413,9 @@ TEST_F(IRBuilderTest, ConstrainedFPIntrinsics) { Builder.setDefaultConstrainedExcept(fp::ebStrict); Builder.setDefaultConstrainedRounding(RoundingMode::TowardZero); - Function *Fn = Intrinsic::getDeclaration(M.get(), - Intrinsic::experimental_constrained_roundeven, { Type::getDoubleTy(Ctx) }); + Function *Fn = Intrinsic::getOrInsertDeclaration( + M.get(), Intrinsic::experimental_constrained_roundeven, + {Type::getDoubleTy(Ctx)}); V = Builder.CreateConstrainedFPCall(Fn, { VDouble }); CII = cast(V); EXPECT_EQ(Intrinsic::experimental_constrained_roundeven, CII->getIntrinsicID()); diff --git a/llvm/unittests/IR/IntrinsicsTest.cpp b/llvm/unittests/IR/IntrinsicsTest.cpp index 0c4af28a2ab57b..7fe0bd79b80a60 100644 --- 
a/llvm/unittests/IR/IntrinsicsTest.cpp +++ b/llvm/unittests/IR/IntrinsicsTest.cpp @@ -50,7 +50,7 @@ class IntrinsicsTest : public ::testing::Test { Instruction *makeIntrinsic(Intrinsic::ID ID) const { IRBuilder<> Builder(BB); SmallVector ProcessedArgs; - auto *Decl = Intrinsic::getDeclaration(M.get(), ID); + auto *Decl = Intrinsic::getOrInsertDeclaration(M.get(), ID); for (auto *Ty : Decl->getFunctionType()->params()) { auto *Val = Constant::getNullValue(Ty); ProcessedArgs.push_back(Val); diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp index 13f121a2b9c7dd..7dc4b9f448d386 100644 --- a/llvm/unittests/IR/PatternMatch.cpp +++ b/llvm/unittests/IR/PatternMatch.cpp @@ -1766,7 +1766,7 @@ TEST_F(PatternMatchTest, IntrinsicMatcher) { Value *Ops[] = {Name, Hash, Num, Index, Step}; Module *M = BB->getParent()->getParent(); Function *TheFn = - Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::instrprof_increment_step); Value *Intrinsic5 = CallInst::Create(TheFn, Ops, "", BB); diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp index 925a69bafa07ef..d6ad7599ce4610 100644 --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -420,7 +420,7 @@ TEST_F(VPIntrinsicTest, VPToNonPredIntrinsicRoundTrip) { ASSERT_TRUE(IsFullTrip); } -/// Check that VPIntrinsic::getDeclarationForParams works. +/// Check that VPIntrinsic::getOrInsertDeclarationForParams works. 
TEST_F(VPIntrinsicTest, VPIntrinsicDeclarationForParams) { std::unique_ptr M = createVPDeclarationModule(); assert(M); @@ -436,7 +436,7 @@ TEST_F(VPIntrinsicTest, VPIntrinsicDeclarationForParams) { Values.push_back(UndefValue::get(ParamTy)); ASSERT_NE(F.getIntrinsicID(), Intrinsic::not_intrinsic); - auto *NewDecl = VPIntrinsic::getDeclarationForParams( + auto *NewDecl = VPIntrinsic::getOrInsertDeclarationForParams( OutM.get(), F.getIntrinsicID(), FuncTy->getReturnType(), Values); ASSERT_TRUE(NewDecl); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 372c5aaea59382..376b00224eb574 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1195,7 +1195,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { // Test for a call to a function without side-effects. LLVMContext C; Module M("", C); - Function *TheFn = Intrinsic::getDeclaration(&M, Intrinsic::thread_pointer); + Function *TheFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::thread_pointer); auto *Call = CallInst::Create(TheFn->getFunctionType(), TheFn); VPValue Op1; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index 5031426033aea1..448a171cf3e412 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -347,7 +347,7 @@ class LLVM_ConstrainedIntrgetType(); }); llvm::Module *module = builder.GetInsertBlock()->getModule(); llvm::Function *callee = - llvm::Intrinsic::getDeclaration(module, + llvm::Intrinsic::getOrInsertDeclaration(module, llvm::Intrinsic::experimental_constrained_}] # mnem # [{, overloadedTypes); }] # !cond(!gt(hasRoundingMode, 0) : [{ @@ -541,7 +541,7 @@ class LLVM_DbgIntrOp traits = []> llvm::Module *module = builder.GetInsertBlock()->getModule(); llvm::LLVMContext &ctx = module->getContext(); 
llvm::Function *fn = - llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::}] + llvm::Intrinsic::getOrInsertDeclaration(module, llvm::Intrinsic::}] # !subst(".", "_", name) # [{); builder.CreateCall(fn, { llvm::MetadataAsValue::get(ctx, @@ -594,7 +594,7 @@ def LLVM_DbgLabelOp : LLVM_IntrOp<"dbg.label", [], [], [], 0> { llvm::Module *module = builder.GetInsertBlock()->getModule(); llvm::LLVMContext &ctx = module->getContext(); llvm::Function *fn = - llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::dbg_label); + llvm::Intrinsic::getOrInsertDeclaration(module, llvm::Intrinsic::dbg_label); builder.CreateCall(fn, { llvm::MetadataAsValue::get(ctx, moduleTranslation.translateDebugInfo($label)) }); diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index 46b7b0a473c692..a8595d14ccf2e5 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -99,7 +99,8 @@ getOverloadedDeclaration(CallIntrinsicOp op, llvm::Intrinsic::ID id, } ArrayRef overloadedArgTysRef = overloadedArgTys; - return llvm::Intrinsic::getDeclaration(module, id, overloadedArgTysRef); + return llvm::Intrinsic::getOrInsertDeclaration(module, id, + overloadedArgTysRef); } static llvm::OperandBundleDef @@ -143,7 +144,7 @@ convertCallLLVMIntrinsicOp(CallIntrinsicOp op, llvm::IRBuilderBase &builder, return failure(); fn = *fnOrFailure; } else { - fn = llvm::Intrinsic::getDeclaration(module, id, {}); + fn = llvm::Intrinsic::getOrInsertDeclaration(module, id, {}); } // Check the result type of the call. 
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index a5de90160c4145..add0a31c114f8d 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -839,7 +839,8 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( llvm::IRBuilderBase &builder, llvm::Intrinsic::ID intrinsic, ArrayRef args, ArrayRef tys) { llvm::Module *module = builder.GetInsertBlock()->getModule(); - llvm::Function *fn = llvm::Intrinsic::getDeclaration(module, intrinsic, tys); + llvm::Function *fn = + llvm::Intrinsic::getOrInsertDeclaration(module, intrinsic, tys); return builder.CreateCall(fn, args); } @@ -886,8 +887,8 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( for (unsigned overloadedOperandIdx : overloadedOperands) overloadedTypes.push_back(args[overloadedOperandIdx]->getType()); llvm::Module *module = builder.GetInsertBlock()->getModule(); - llvm::Function *llvmIntr = - llvm::Intrinsic::getDeclaration(module, intrinsic, overloadedTypes); + llvm::Function *llvmIntr = llvm::Intrinsic::getOrInsertDeclaration( + module, intrinsic, overloadedTypes); return builder.CreateCall(llvmIntr, args); } diff --git a/polly/lib/CodeGen/IslExprBuilder.cpp b/polly/lib/CodeGen/IslExprBuilder.cpp index aaafac14bf8065..1688c41c624b24 100644 --- a/polly/lib/CodeGen/IslExprBuilder.cpp +++ b/polly/lib/CodeGen/IslExprBuilder.cpp @@ -129,16 +129,16 @@ Value *IslExprBuilder::createBinOp(BinaryOperator::BinaryOps Opc, Value *LHS, Module *M = Builder.GetInsertBlock()->getModule(); switch (Opc) { case Instruction::Add: - F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow, - {LHS->getType()}); + F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::sadd_with_overflow, + {LHS->getType()}); break; case Instruction::Sub: - F = Intrinsic::getDeclaration(M, Intrinsic::ssub_with_overflow, - {LHS->getType()}); + F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::ssub_with_overflow, + 
{LHS->getType()}); break; case Instruction::Mul: - F = Intrinsic::getDeclaration(M, Intrinsic::smul_with_overflow, - {LHS->getType()}); + F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::smul_with_overflow, + {LHS->getType()}); break; default: llvm_unreachable("No overflow intrinsic for binary operator found!"); diff --git a/polly/lib/CodeGen/PerfMonitor.cpp b/polly/lib/CodeGen/PerfMonitor.cpp index 3cad8537f3ee19..1a791614685443 100644 --- a/polly/lib/CodeGen/PerfMonitor.cpp +++ b/polly/lib/CodeGen/PerfMonitor.cpp @@ -59,7 +59,7 @@ void PerfMonitor::addToGlobalConstructors(Function *Fn) { } Function *PerfMonitor::getRDTSCP() { - return Intrinsic::getDeclaration(M, Intrinsic::x86_rdtscp); + return Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_rdtscp); } PerfMonitor::PerfMonitor(const Scop &S, Module *M) From c84f75966af79a381e27e6ffc9481c1fae2fcb4f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 11 Oct 2024 05:38:17 -0700 Subject: [PATCH 161/177] [libc] Fix compilation of new trig functions (#111987) --- libc/src/math/generic/cos.cpp | 2 +- libc/src/math/generic/range_reduction_double_common.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/src/math/generic/cos.cpp b/libc/src/math/generic/cos.cpp index 923ea96852d889..568b1254c6f02b 100644 --- a/libc/src/math/generic/cos.cpp +++ b/libc/src/math/generic/cos.cpp @@ -93,7 +93,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { } return ans; }; - DoubleDouble sin_k = get_idx_dd(k + 128); + DoubleDouble msin_k = get_idx_dd(k + 128); DoubleDouble cos_k = get_idx_dd(k + 64); #else // Fast look up version, but needs 256-entry table. 
diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h index e23bbff144bee8..bcab82f6c9c3a8 100644 --- a/libc/src/math/generic/range_reduction_double_common.h +++ b/libc/src/math/generic/range_reduction_double_common.h @@ -278,6 +278,7 @@ struct LargeRangeReduction { DoubleDouble y_mid; }; +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS static Float128 range_reduction_small_f128(double x) { constexpr Float128 PI_OVER_128_F128 = { Sign::POS, -133, 0xc90f'daa2'2168'c234'c4c6'628b'80dc'1cd1_u128}; @@ -300,7 +301,6 @@ static Float128 range_reduction_small_f128(double x) { return fputil::quick_mul(y, PI_OVER_128_F128); } -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS static constexpr Float128 SIN_K_PI_OVER_128_F128[65] = { {Sign::POS, 0, 0}, {Sign::POS, -133, 0xc90a'afbd'1b33'efc9'c539'edcb'fda0'cf2c_u128}, From 26b832a9ec03d0a35baaf00d81f607004fe2a8cf Mon Sep 17 00:00:00 2001 From: Daniel Mokeev Date: Fri, 11 Oct 2024 14:41:47 +0200 Subject: [PATCH 162/177] [RISCV] Add DAG combine to turn (sub (shl X, 8-Y), (shr X, Y)) into orc.b (#111828) This patch generalizes the DAG combine for `(sub (shl X, 8), X) => (orc.b X)` into the more general form of `(sub (shl X, 8 - Y), (srl X, Y)) => (orc.b X)`. 
Alive2 generalized proof: https://alive2.llvm.org/ce/z/dFcf_n Related issue: https://github.com/llvm/llvm-project/issues/96595 Related PR: https://github.com/llvm/llvm-project/pull/96680 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 44 ++- llvm/test/CodeGen/RISCV/orc-b-patterns.ll | 372 ++++++++++++++++++++ 2 files changed, 408 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/orc-b-patterns.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 1f9fc984515cf6..e71c8c3dc1c759 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13569,8 +13569,10 @@ static SDValue combineSubOfBoolean(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::ADD, DL, VT, NewLHS, NewRHS); } -// Looks for (sub (shl X, 8), X) where only bits 8, 16, 24, 32, etc. of X are -// non-zero. Replace with orc.b. +// Looks for (sub (shl X, 8-Y), (shr X, Y)) where the Y-th bit in each byte is +// potentially set. It is fine for Y to be 0, meaning that (sub (shl X, 8), X) +// is also valid. Replace with (orc.b X). For example, 0b0000_1000_0000_1000 is +// valid with Y=3, while 0b0000_1000_0000_0100 is not. 
static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { if (!Subtarget.hasStdExtZbb()) @@ -13584,18 +13586,44 @@ static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N0.getOpcode() != ISD::SHL || N0.getOperand(0) != N1 || !N0.hasOneUse()) + if (N0->getOpcode() != ISD::SHL) return SDValue(); - auto *ShAmtC = dyn_cast(N0.getOperand(1)); - if (!ShAmtC || ShAmtC->getZExtValue() != 8) + auto *ShAmtCLeft = dyn_cast(N0.getOperand(1)); + if (!ShAmtCLeft) return SDValue(); + unsigned ShiftedAmount = 8 - ShAmtCLeft->getZExtValue(); - APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0xfe)); - if (!DAG.MaskedValueIsZero(N1, Mask)) + if (ShiftedAmount >= 8) return SDValue(); - return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, N1); + SDValue LeftShiftOperand = N0->getOperand(0); + SDValue RightShiftOperand = N1; + + if (ShiftedAmount != 0) { // Right operand must be a right shift. + if (N1->getOpcode() != ISD::SRL) + return SDValue(); + auto *ShAmtCRight = dyn_cast(N1.getOperand(1)); + if (!ShAmtCRight || ShAmtCRight->getZExtValue() != ShiftedAmount) + return SDValue(); + RightShiftOperand = N1.getOperand(0); + } + + // At least one shift should have a single use. + if (!N0.hasOneUse() && (ShiftedAmount == 0 || !N1.hasOneUse())) + return SDValue(); + + if (LeftShiftOperand != RightShiftOperand) + return SDValue(); + + APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0x1)); + Mask <<= ShiftedAmount; + // Check that X has indeed the right shape (only the Y-th bit can be set in + // every byte). 
+ if (!DAG.MaskedValueIsZero(LeftShiftOperand, ~Mask)) + return SDValue(); + + return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, LeftShiftOperand); } static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, diff --git a/llvm/test/CodeGen/RISCV/orc-b-patterns.ll b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll new file mode 100644 index 00000000000000..184e66c14b33fc --- /dev/null +++ b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll @@ -0,0 +1,372 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32I +; RUN: llc -mtriple=riscv32 -mattr=+zbb -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32ZBB + +define i32 @orc_b_i32_mul255(i32 %x) nounwind { +; RV32I-LABEL: orc_b_i32_mul255: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_mul255: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 4112 +; RV32ZBB-NEXT: addi a1, a1, 257 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 16843009 + %mul = mul nuw nsw i32 %and, 255 + ret i32 %mul +} + + +define i32 @orc_b_i32_sub_shl8x_x_lsb(i32 %x) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_lsb: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_lsb: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 4112 +; RV32ZBB-NEXT: addi a1, a1, 257 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 16843009 + %sub = mul nuw i32 %and, 255 + ret i32 %sub +} + +define i32 
@orc_b_i32_sub_shl8x_x_lsb_preshifted(i32 %x){ +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_lsb_preshifted: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: srli a0, a0, 11 +; RV32I-NEXT: lui a1, 16 +; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_lsb_preshifted: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: srli a0, a0, 11 +; RV32ZBB-NEXT: lui a1, 16 +; RV32ZBB-NEXT: addi a1, a1, 257 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %shr = lshr i32 %x, 11 + %and = and i32 %shr, 16843009 + %sub = mul nuw i32 %and, 255 + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b1(i32 %x) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 8224 +; RV32I-NEXT: addi a1, a1, 514 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 7 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 8224 +; RV32ZBB-NEXT: addi a1, a1, 514 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b2(i32 %x) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 16448 +; RV32I-NEXT: addi a1, a1, 1028 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 6 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b2: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 16448 +; RV32ZBB-NEXT: addi a1, a1, 1028 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 67372036 + %shl = shl 
i32 %and, 6 + %shr = lshr exact i32 %and, 2 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b3(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 24672 +; CHECK-NEXT: addi a1, a1, 1542 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 5 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 101058054 + %shl = shl nuw i32 %and, 5 + %shr = lshr i32 %and, 3 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b4(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 32897 +; CHECK-NEXT: addi a1, a1, -2040 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 4 +; CHECK-NEXT: srli a0, a0, 4 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 134744072 + %shl = shl nuw i32 %and, 4 + %shr = lshr i32 %and, 4 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b5(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 65793 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 5 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 269488144 + %shl = shl nuw i32 %and, 3 + %shr = lshr i32 %and, 5 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b6(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 131586 +; CHECK-NEXT: addi a1, a1, 32 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: srli a0, a0, 6 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 538976288 + %shl = shl nuw i32 %and, 2 + %shr = lshr i32 %and, 6 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 
@orc_b_i32_sub_shl8x_x_b7(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 263172 +; CHECK-NEXT: addi a1, a1, 64 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: srli a0, a0, 7 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 1077952576 + %shl = shl nuw i32 %and, 1 + %shr = lshr i32 %and, 7 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + +define i32 @orc_b_i32_sub_shl8x_x_b1_shl_used(i32 %x, ptr %arr) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_shl_used: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a2, 8224 +; RV32I-NEXT: addi a2, a2, 514 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a2, a0, 7 +; RV32I-NEXT: srli a3, a0, 1 +; RV32I-NEXT: sub a0, a2, a3 +; RV32I-NEXT: sw a3, 0(a1) +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_shl_used: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a2, 8224 +; RV32ZBB-NEXT: addi a2, a2, 514 +; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: srli a2, a0, 1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: sw a2, 0(a1) +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + store i32 %shr, ptr %arr, align 4 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + +define i32 @orc_b_i32_sub_shl8x_x_b1_srl_used(i32 %x, ptr %arr) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_srl_used: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a2, 8224 +; RV32I-NEXT: addi a2, a2, 514 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a2, a0, 7 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: sw a2, 0(a1) +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_srl_used: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a2, 8224 +; RV32ZBB-NEXT: addi a2, a2, 514 +; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: slli a2, a0, 7 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: sw a2, 0(a1) +; RV32ZBB-NEXT: ret +entry: + %and = and i32 
%x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + store i32 %shl, ptr %arr, align 4 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b1_not_used(i32 %x, ptr %arr) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_not_used: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 8224 +; RV32I-NEXT: addi a1, a1, 514 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 7 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_not_used: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 8224 +; RV32ZBB-NEXT: addi a1, a1, 514 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + +define i32 @orc_b_i32_sub_shl8x_x_shl_used(i32 %x, ptr %arr){ +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_shl_used: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 4112 +; CHECK-NEXT: addi a2, a2, 257 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: slli a2, a0, 8 +; CHECK-NEXT: sub a0, a2, a0 +; CHECK-NEXT: sw a2, 0(a1) +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 16843009 + %shl = shl i32 %and, 8 + store i32 %shl, ptr %arr, align 4 + %sub = mul nuw i32 %and, 255 + ret i32 %sub +} + +define i32 @orc_b_i32_sub_shl8x_x_b1_both_used(i32 %x, ptr %arr) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b1_both_used: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 8224 +; CHECK-NEXT: addi a2, a2, 514 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: slli a2, a0, 7 +; CHECK-NEXT: srli a3, a0, 1 +; CHECK-NEXT: sw a2, 0(a1) +; CHECK-NEXT: sub a0, a2, a3 +; CHECK-NEXT: sw a3, 4(a1) +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + store i32 %shl, ptr %arr, align 4 + %arrayidx1 = getelementptr inbounds i8, ptr %arr, i32 4 + store i32 %shr, ptr %arrayidx1, 
align 4 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_x_shr8x(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_x_shr8x: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 4112 +; CHECK-NEXT: addi a1, a1, 257 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: srli a1, a0, 8 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 16843009 + %shr = lshr i32 %and, 8 + %sub = sub nsw i32 %and, %shr + ret i32 %sub +} From 9a696b68b735fa01276d16d39370f9102fee4a0b Mon Sep 17 00:00:00 2001 From: Emilio Cota Date: Fri, 11 Oct 2024 08:18:11 -0400 Subject: [PATCH 163/177] Revert "[NVPTX] Prefer prmt.b32 over bfi.b32 (#110766)" This reverts commit 3f9998af4f79e95fe8be615df9d6b898008044b9. It breaks downstream tests with egregious numerical differences. Unfortunately no upstream tests are broken, but the fact that a prior iteration of the commit (pre-optimization) does work with our downstream tests (coming from the Triton repo) supports the claim that the final version of the commit is incorrect. Reverting now so that the original author can evaluate. --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 31 +- llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 614 +++++++++---------- llvm/test/CodeGen/NVPTX/sext-setcc.ll | 18 +- 3 files changed, 328 insertions(+), 335 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d95f8f214be557..57bc5fe0ac361c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2332,23 +2332,20 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us // to optimize calculation of constant parts. 
if (VT == MVT::v4i8) { - SDValue PRMT__10 = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), - DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), - DAG.getConstant(0x3340, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - SDValue PRMT32__ = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), - DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), - DAG.getConstant(0x4033, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - SDValue PRMT3210 = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {PRMT__10, PRMT32__, DAG.getConstant(0x5410, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210); + SDValue C8 = DAG.getConstant(8, DL, MVT::i32); + SDValue E01 = DAG.getNode( + NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), + DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); + SDValue E012 = + DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), + E01, DAG.getConstant(16, DL, MVT::i32), C8); + SDValue E0123 = + DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), + E012, DAG.getConstant(24, DL, MVT::i32), C8); + return DAG.getNode(ISD::BITCAST, DL, VT, E0123); } return Op; } diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 84dde539ce4c47..96a4359d0ec43e 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -101,38 +101,38 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_add( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, 
[test_add_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; -; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; ; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, %b @@ -143,29 +143,29 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 { ; CHECK-LABEL: test_add_imm_0( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 
%r<13>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 4; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 3; +; CHECK-NEXT: add.s16 %rs4, %rs3, 2; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 16435; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 2; +; CHECK-NEXT: add.s16 %rs6, %rs5, 3; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; -; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 4; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> , %a @@ -176,29 +176,29 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 { ; CHECK-LABEL: test_add_imm_1( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 4; +; CHECK-NEXT: add.s16 %rs2, 
%rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 3; +; CHECK-NEXT: add.s16 %rs4, %rs3, 2; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 16435; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 2; +; CHECK-NEXT: add.s16 %rs6, %rs5, 3; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; -; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 4; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, @@ -209,38 +209,38 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_sub( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 
16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; -; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; ; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = sub <4 x i8> %a, %b @@ -251,38 +251,38 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<26>; +; CHECK-NEXT: .reg .b32 %r<27>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 
%r6, %r1, 16, 8; ; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; ; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; -; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 16435; -; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; +; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; ; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; -; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; -; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120; -; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; +; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; +; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; +; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sgt <4 x i8> %a, %b @@ -294,30 +294,30 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_umax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: 
ld.param.u32 %r2, [test_umax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; ; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; ; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ugt <4 x i8> %a, %b @@ -329,38 +329,38 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<26>; +; CHECK-NEXT: .reg .b32 %r<27>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; +; CHECK-NEXT: 
bfe.s32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; ; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; -; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 16435; -; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; +; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; ; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; -; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; -; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120; -; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; +; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; +; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; +; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sle <4 x i8> %a, %b @@ -372,30 +372,30 @@ define <4 x i8> @test_umin(<4 x i8> %a, 
<4 x i8> %b) #0 { ; CHECK-LABEL: test_umin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.ls.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.ls.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.ls.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.ls.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; ; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; ; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ule <4 x i8> %a, %b @@ -407,35 +407,35 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-LABEL: test_eq( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<23>; +; CHECK-NEXT: .reg .b32 %r<24>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: 
ld.param.u32 %r3, [test_eq_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0]; -; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; ; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; ; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; ; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 16435; -; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; +; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; -; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; -; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 13120; -; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 21520; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; +; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; +; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; ; CHECK-NEXT: ret; %cmp = icmp eq <4 x i8> %a, %b @@ -447,35 +447,35 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-LABEL: test_ne( 
; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<23>; +; CHECK-NEXT: .reg .b32 %r<24>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0]; -; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 16435; -; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; +; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; -; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; -; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 13120; -; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 21520; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; +; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; +; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; ; 
CHECK-NEXT: ret; %cmp = icmp ne <4 x i8> %a, %b @@ -487,38 +487,38 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_mul( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; -; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; ; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; +; 
CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = mul <4 x i8> %a, %b @@ -548,13 +548,12 @@ define <4 x i8> @test_or_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0]; -; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: or.b32 %r8, %r6, %r5; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; +; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; +; CHECK-NEXT: or.b32 %r8, %r6, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -614,13 +613,12 @@ define <4 x i8> @test_xor_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0]; -; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: xor.b32 %r8, %r6, %r5; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; +; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; +; CHECK-NEXT: xor.b32 %r8, %r6, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -680,13 +678,12 @@ define <4 x i8> @test_and_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0]; -; 
CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: and.b32 %r8, %r6, %r5; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; +; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; +; CHECK-NEXT: and.b32 %r8, %r6, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -929,40 +926,40 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> ; CHECK-LABEL: test_select_cc( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<28>; +; CHECK-NEXT: .reg .b32 %r<29>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r4, [test_select_cc_param_3]; ; CHECK-NEXT: ld.param.u32 %r3, [test_select_cc_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0]; -; CHECK-NEXT: bfe.u32 %r5, %r4, 0, 8; -; CHECK-NEXT: bfe.u32 %r6, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; +; CHECK-NEXT: bfe.u32 %r6, %r3, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r4, 8, 8; -; CHECK-NEXT: bfe.u32 %r8, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r3, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r4, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r9, %r4, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r3, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r4, 24, 8; -; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r4, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r12, %r11; -; CHECK-NEXT: bfe.u32 
%r13, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; ; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4; -; CHECK-NEXT: bfe.u32 %r16, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r17, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r16, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3; -; CHECK-NEXT: prmt.b32 %r19, %r18, %r15, 16435; -; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r21, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r15, 8, 8; +; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r21, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2; -; CHECK-NEXT: bfe.u32 %r23, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r24, %r1, 0, 8; -; CHECK-NEXT: selp.b32 %r25, %r24, %r23, %p1; -; CHECK-NEXT: prmt.b32 %r26, %r25, %r22, 13120; -; CHECK-NEXT: prmt.b32 %r27, %r26, %r19, 21520; +; CHECK-NEXT: bfi.b32 %r23, %r22, %r19, 16, 8; +; CHECK-NEXT: bfe.u32 %r24, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r25, %r1, 24, 8; +; CHECK-NEXT: selp.b32 %r26, %r25, %r24, %p1; +; CHECK-NEXT: bfi.b32 %r27, %r26, %r23, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r27; ; CHECK-NEXT: ret; %cc = icmp ne <4 x i8> %c, %d @@ -1009,32 +1006,32 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, ; CHECK-LABEL: test_select_cc_i8_i32( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<26>; +; CHECK-NEXT: .reg .b32 %r<27>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; ; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_i8_i32_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_i8_i32_param_0]; -; CHECK-NEXT: setp.ne.s32 %p1, %r3, %r7; -; CHECK-NEXT: setp.ne.s32 %p2, %r4, %r8; -; CHECK-NEXT: setp.ne.s32 %p3, %r5, %r9; -; CHECK-NEXT: setp.ne.s32 %p4, %r6, %r10; -; 
CHECK-NEXT: bfe.u32 %r11, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 24, 8; +; CHECK-NEXT: setp.ne.s32 %p1, %r6, %r10; +; CHECK-NEXT: setp.ne.s32 %p2, %r5, %r9; +; CHECK-NEXT: setp.ne.s32 %p3, %r4, %r8; +; CHECK-NEXT: setp.ne.s32 %p4, %r3, %r7; +; CHECK-NEXT: bfe.u32 %r11, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 0, 8; ; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r15, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r15, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 16435; -; CHECK-NEXT: bfe.u32 %r18, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 8, 8; +; CHECK-NEXT: bfe.u32 %r18, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r19, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2; -; CHECK-NEXT: bfe.u32 %r21, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r22, %r1, 0, 8; -; CHECK-NEXT: selp.b32 %r23, %r22, %r21, %p1; -; CHECK-NEXT: prmt.b32 %r24, %r23, %r20, 13120; -; CHECK-NEXT: prmt.b32 %r25, %r24, %r17, 21520; +; CHECK-NEXT: bfi.b32 %r21, %r20, %r17, 16, 8; +; CHECK-NEXT: bfe.u32 %r22, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r23, %r1, 24, 8; +; CHECK-NEXT: selp.b32 %r24, %r23, %r22, %p1; +; CHECK-NEXT: bfi.b32 %r25, %r24, %r21, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; <4 x i32> %c, <4 x i32> %d) #0 { @@ -1047,13 +1044,13 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { ; CHECK-LABEL: test_trunc_2xi32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0]; -; CHECK-NEXT: prmt.b32 %r5, %r3, %r4, 16435; -; CHECK-NEXT: prmt.b32 %r6, %r1, %r2, 13120; -; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 21520; +; CHECK-NEXT: bfi.b32 %r5, %r2, %r1, 8, 8; +; 
CHECK-NEXT: bfi.b32 %r6, %r3, %r5, 16, 8; +; CHECK-NEXT: bfi.b32 %r7, %r4, %r6, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i32> %a to <4 x i8> @@ -1063,19 +1060,19 @@ define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 { ; CHECK-LABEL: test_trunc_2xi64( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; ; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; -; CHECK-NEXT: cvt.u32.u64 %r1, %rd4; -; CHECK-NEXT: cvt.u32.u64 %r2, %rd3; -; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 16435; -; CHECK-NEXT: cvt.u32.u64 %r4, %rd2; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd1; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 13120; -; CHECK-NEXT: prmt.b32 %r7, %r6, %r3, 21520; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd1; +; CHECK-NEXT: cvt.u32.u64 %r2, %rd2; +; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; +; CHECK-NEXT: cvt.u32.u64 %r4, %rd3; +; CHECK-NEXT: bfi.b32 %r5, %r4, %r3, 16, 8; +; CHECK-NEXT: cvt.u32.u64 %r6, %rd4; +; CHECK-NEXT: bfi.b32 %r7, %r6, %r5, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i64> %a to <4 x i8> @@ -1187,16 +1184,15 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { ; CHECK-LABEL: test_bitcast_4xi8_to_2xhalf( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; -; CHECK-NEXT: mov.b32 %r1, 6; -; CHECK-NEXT: prmt.b32 %r2, %r1, 7, 16435; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 5, 13120; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 5, 
%r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 6, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 7, %r3, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -1259,27 +1255,27 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptosi_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; -; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs5; +; CHECK-NEXT: cvt.u32.u16 %r7, %rs6; +; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs7; ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 13120; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 21520; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs11; +; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; +; CHECK-NEXT: cvt.u32.u16 %r12, %rs12; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13; ; CHECK-NEXT: ret; %r = fptosi <4 x half> %a to <4 x i8> @@ -1290,27 +1286,27 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptoui_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg 
.b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; -; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs5; +; CHECK-NEXT: cvt.u32.u16 %r7, %rs6; +; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs7; ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 13120; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 21520; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs11; +; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; +; CHECK-NEXT: cvt.u32.u16 %r12, %rs12; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13; ; CHECK-NEXT: ret; %r = fptoui <4 x half> %a to <4 x i8> @@ -1330,33 +1326,33 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_srem_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 %r2, [%rd2]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs1, %r3; -; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs2, %r4; ; CHECK-NEXT: rem.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8; ; 
CHECK-NEXT: cvt.s8.s32 %rs4, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs5, %r7; ; CHECK-NEXT: rem.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; -; CHECK-NEXT: bfe.s32 %r10, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs7, %r10; -; CHECK-NEXT: bfe.s32 %r11, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs8, %r11; ; CHECK-NEXT: rem.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfe.s32 %r13, %r2, 0, 8; -; CHECK-NEXT: cvt.s8.s32 %rs10, %r13; -; CHECK-NEXT: bfe.s32 %r14, %r1, 0, 8; -; CHECK-NEXT: cvt.s8.s32 %rs11, %r14; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.s8.s32 %rs10, %r14; +; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.s8.s32 %rs11, %r15; ; CHECK-NEXT: rem.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; ; CHECK-NEXT: st.u32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: @@ -1377,7 +1373,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: test_srem_v3i8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<20>; -; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry @@ -1396,25 +1392,25 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs9; ; CHECK-NEXT: ld.s8 %rs10, [%rd2+2]; -; CHECK-NEXT: bfe.s32 %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.s32 %r5, %r3, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs11, %r5; -; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 
0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs12, %r6; ; CHECK-NEXT: rem.s16 %rs13, %rs12, %rs11; ; CHECK-NEXT: cvt.u32.u16 %r7, %rs13; -; CHECK-NEXT: bfe.s32 %r8, %r3, 0, 8; +; CHECK-NEXT: bfe.s32 %r8, %r3, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs14, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs15, %r9; ; CHECK-NEXT: rem.s16 %rs16, %rs15, %rs14; ; CHECK-NEXT: cvt.u32.u16 %r10, %rs16; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r7, 13120; +; CHECK-NEXT: bfi.b32 %r11, %r10, %r7, 8, 8; ; CHECK-NEXT: // implicit-def: %r13 -; CHECK-NEXT: // implicit-def: %r14 -; CHECK-NEXT: prmt.b32 %r12, %r13, %r14, 16435; -; CHECK-NEXT: prmt.b32 %r15, %r11, %r12, 21520; +; CHECK-NEXT: bfi.b32 %r12, %r13, %r11, 16, 8; +; CHECK-NEXT: // implicit-def: %r15 +; CHECK-NEXT: bfi.b32 %r14, %r15, %r12, 24, 8; ; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10; -; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs18, tmp}, %r15; } +; CHECK-NEXT: cvt.u16.u32 %rs18, %r14; ; CHECK-NEXT: st.u8 [%rd3], %rs18; ; CHECK-NEXT: shr.u16 %rs19, %rs18, 8; ; CHECK-NEXT: st.u8 [%rd3+1], %rs19; @@ -1441,25 +1437,25 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 %r2, [%rd2]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; 
CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.s32 %r11, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r12, -1, 0, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; ; CHECK-NEXT: selp.s32 %r14, -1, 0, %p2; -; CHECK-NEXT: selp.s32 %r15, -1, 0, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.s32 %r16, -1, 0, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; ; CHECK-NEXT: st.u32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll index 8b7e5235443f05..f471d47077cf0d 100644 --- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll +++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll @@ -33,35 +33,35 @@ define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.u64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: and.b16 %rs2, %rs1, 255; ; CHECK-NEXT: setp.eq.s16 %p1, %rs2, 0; -; CHECK-NEXT: bfe.u32 %r3, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r3; ; CHECK-NEXT: and.b16 %rs4, %rs3, 255; ; CHECK-NEXT: setp.eq.s16 %p2, %rs4, 0; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r4; ; CHECK-NEXT: and.b16 %rs6, %rs5, 255; ; CHECK-NEXT: setp.eq.s16 %p3, %rs6, 0; -; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r5; ; 
CHECK-NEXT: and.b16 %rs8, %rs7, 255; ; CHECK-NEXT: setp.eq.s16 %p4, %rs8, 0; ; CHECK-NEXT: selp.s32 %r6, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r7, -1, 0, %p3; -; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; +; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; ; CHECK-NEXT: selp.s32 %r9, -1, 0, %p2; -; CHECK-NEXT: selp.s32 %r10, -1, 0, %p1; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r9, 13120; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r8, 21520; +; CHECK-NEXT: bfi.b32 %r10, %r9, %r8, 16, 8; +; CHECK-NEXT: selp.s32 %r11, -1, 0, %p1; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r10, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; entry: From c8da2253f9aa4dff039e9ed766ff0f865632a0eb Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 11 Oct 2024 05:45:09 -0700 Subject: [PATCH 164/177] [Clang] Replace Intrinsic::getDeclaration with getOrInsertDeclaration (#111990) Fix build failure from the rename change. Looks like one additional reference sneaked in between pre-commit checks and the commit itself. --- clang/lib/CodeGen/CGBuiltin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 059c75fae284dd..465afd04740d89 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18882,7 +18882,7 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: { // for the DirectX intrinsic and the demangled builtin name switch (CGM.getTarget().getTriple().getArch()) { case llvm::Triple::dxil: - return EmitRuntimeCall(Intrinsic::getDeclaration( + return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( &CGM.getModule(), Intrinsic::dx_wave_getlaneindex)); case llvm::Triple::spirv: return EmitRuntimeCall(CGM.CreateRuntimeFunction( From ed7251b3aeb7c471dc50e9409e83a9ec01f40df5 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Fri, 11 Oct 2024 14:46:46 +0200 Subject: [PATCH 165/177] Revert "[clang] Implement TTP P0522 pack matching for deduced function template calls. 
(#111457)" See discussion in https://github.com/llvm/llvm-project/pull/111711 This reverts commit 4dadf42c1a74dd4e37db9ffd6fbb3027f59751a7. --- clang/include/clang/Sema/Overload.h | 10 +-- clang/include/clang/Sema/Sema.h | 23 +++---- clang/lib/Sema/SemaLookup.cpp | 1 - clang/lib/Sema/SemaOverload.cpp | 50 ++++++--------- clang/lib/Sema/SemaTemplate.cpp | 23 ++++--- clang/lib/Sema/SemaTemplateDeduction.cpp | 70 +++++++++----------- clang/test/SemaTemplate/cwg2398.cpp | 81 ------------------------ 7 files changed, 69 insertions(+), 189 deletions(-) diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index d38278c5041118..c716a25bb673b8 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -925,11 +925,6 @@ class Sema; bool TookAddressOfOverload : 1; - /// Have we matched any packs on the parameter side, versus any non-packs on - /// the argument side, in a context where the opposite matching is also - /// allowed? - bool HasMatchedPackOnParmToNonPackOnArg : 1; - /// True if the candidate was found using ADL. 
CallExpr::ADLCallKind IsADLCandidate : 1; @@ -1004,9 +999,8 @@ class Sema; friend class OverloadCandidateSet; OverloadCandidate() : IsSurrogate(false), IgnoreObjectArgument(false), - TookAddressOfOverload(false), - HasMatchedPackOnParmToNonPackOnArg(false), - IsADLCandidate(CallExpr::NotADL), RewriteKind(CRK_None) {} + TookAddressOfOverload(false), IsADLCandidate(CallExpr::NotADL), + RewriteKind(CRK_None) {} }; /// OverloadCandidateSet - A set of overload candidates, used in C++ diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index f8118ca64ad3f2..66b0846f286a81 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10134,8 +10134,7 @@ class Sema final : public SemaBase { ADLCallKind IsADLCandidate = ADLCallKind::NotADL, ConversionSequenceList EarlyConversions = std::nullopt, OverloadCandidateParamOrder PO = {}, - bool AggregateCandidateDeduction = false, - bool HasMatchedPackOnParmToNonPackOnArg = false); + bool AggregateCandidateDeduction = false); /// Add all of the function declarations in the given function set to /// the overload candidate set. 
@@ -10170,8 +10169,7 @@ class Sema final : public SemaBase { bool SuppressUserConversions = false, bool PartialOverloading = false, ConversionSequenceList EarlyConversions = std::nullopt, - OverloadCandidateParamOrder PO = {}, - bool HasMatchedPackOnParmToNonPackOnArg = false); + OverloadCandidateParamOrder PO = {}); /// Add a C++ member function template as a candidate to the candidate /// set, using template argument deduction to produce an appropriate member @@ -10217,8 +10215,7 @@ class Sema final : public SemaBase { CXXConversionDecl *Conversion, DeclAccessPair FoundDecl, CXXRecordDecl *ActingContext, Expr *From, QualType ToType, OverloadCandidateSet &CandidateSet, bool AllowObjCConversionOnExplicit, - bool AllowExplicit, bool AllowResultConversion = true, - bool HasMatchedPackOnParmToNonPackOnArg = false); + bool AllowExplicit, bool AllowResultConversion = true); /// Adds a conversion function template specialization /// candidate to the overload set, using template argument deduction @@ -11641,7 +11638,7 @@ class Sema final : public SemaBase { SourceLocation RAngleLoc, unsigned ArgumentPackIndex, SmallVectorImpl &SugaredConverted, SmallVectorImpl &CanonicalConverted, - CheckTemplateArgumentKind CTAK, bool PartialOrdering, + CheckTemplateArgumentKind CTAK, bool *MatchedPackOnParmToNonPackOnArg); /// Check that the given template arguments can be provided to @@ -11724,8 +11721,7 @@ class Sema final : public SemaBase { /// It returns true if an error occurred, and false otherwise. 
bool CheckTemplateTemplateArgument(TemplateTemplateParmDecl *Param, TemplateParameterList *Params, - TemplateArgumentLoc &Arg, - bool PartialOrdering, + TemplateArgumentLoc &Arg, bool IsDeduced, bool *MatchedPackOnParmToNonPackOnArg); void NoteTemplateLocation(const NamedDecl &Decl, @@ -12237,8 +12233,8 @@ class Sema final : public SemaBase { SmallVectorImpl &Deduced, unsigned NumExplicitlySpecified, FunctionDecl *&Specialization, sema::TemplateDeductionInfo &Info, - SmallVectorImpl const *OriginalCallArgs, - bool PartialOverloading, bool PartialOrdering, + SmallVectorImpl const *OriginalCallArgs = nullptr, + bool PartialOverloading = false, llvm::function_ref CheckNonDependent = [] { return false; }); /// Perform template argument deduction from a function call @@ -12272,8 +12268,7 @@ class Sema final : public SemaBase { TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef Args, FunctionDecl *&Specialization, sema::TemplateDeductionInfo &Info, bool PartialOverloading, bool AggregateDeductionCandidate, - bool PartialOrdering, QualType ObjectType, - Expr::Classification ObjectClassification, + QualType ObjectType, Expr::Classification ObjectClassification, llvm::function_ref)> CheckNonDependent); /// Deduce template arguments when taking the address of a function @@ -12428,7 +12423,7 @@ class Sema final : public SemaBase { bool isTemplateTemplateParameterAtLeastAsSpecializedAs( TemplateParameterList *PParam, TemplateDecl *PArg, TemplateDecl *AArg, const DefaultArguments &DefaultArgs, SourceLocation ArgLoc, - bool PartialOrdering, bool *MatchedPackOnParmToNonPackOnArg); + bool IsDeduced, bool *MatchedPackOnParmToNonPackOnArg); /// Mark which template parameters are used in a given expression. 
/// diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index 60fa195221c938..31422c213ac249 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -3667,7 +3667,6 @@ Sema::LookupLiteralOperator(Scope *S, LookupResult &R, if (CheckTemplateArgument( Params->getParam(0), Arg, FD, R.getNameLoc(), R.getNameLoc(), 0, SugaredChecked, CanonicalChecked, CTAK_Specified, - /*PartialOrdering=*/false, /*MatchedPackOnParmToNonPackOnArg=*/nullptr) || Trap.hasErrorOccurred()) IsTemplate = false; diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index f545e9341e1ae6..2cde8131108fbe 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -6864,8 +6864,7 @@ void Sema::AddOverloadCandidate( OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, bool PartialOverloading, bool AllowExplicit, bool AllowExplicitConversions, ADLCallKind IsADLCandidate, ConversionSequenceList EarlyConversions, - OverloadCandidateParamOrder PO, bool AggregateCandidateDeduction, - bool HasMatchedPackOnParmToNonPackOnArg) { + OverloadCandidateParamOrder PO, bool AggregateCandidateDeduction) { const FunctionProtoType *Proto = dyn_cast(Function->getType()->getAs()); assert(Proto && "Functions without a prototype cannot be overloaded"); @@ -6884,8 +6883,7 @@ void Sema::AddOverloadCandidate( AddMethodCandidate(Method, FoundDecl, Method->getParent(), QualType(), Expr::Classification::makeSimpleLValue(), Args, CandidateSet, SuppressUserConversions, - PartialOverloading, EarlyConversions, PO, - HasMatchedPackOnParmToNonPackOnArg); + PartialOverloading, EarlyConversions, PO); return; } // We treat a constructor like a non-member function, since its object @@ -6928,8 +6926,6 @@ void Sema::AddOverloadCandidate( CandidateSet.getRewriteInfo().getRewriteKind(Function, PO); Candidate.IsADLCandidate = IsADLCandidate; Candidate.ExplicitCallArguments = Args.size(); - 
Candidate.HasMatchedPackOnParmToNonPackOnArg = - HasMatchedPackOnParmToNonPackOnArg; // Explicit functions are not actually candidates at all if we're not // allowing them in this context, but keep them around so we can point @@ -7457,13 +7453,16 @@ void Sema::AddMethodCandidate(DeclAccessPair FoundDecl, QualType ObjectType, } } -void Sema::AddMethodCandidate( - CXXMethodDecl *Method, DeclAccessPair FoundDecl, - CXXRecordDecl *ActingContext, QualType ObjectType, - Expr::Classification ObjectClassification, ArrayRef Args, - OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, - bool PartialOverloading, ConversionSequenceList EarlyConversions, - OverloadCandidateParamOrder PO, bool HasMatchedPackOnParmToNonPackOnArg) { +void +Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl, + CXXRecordDecl *ActingContext, QualType ObjectType, + Expr::Classification ObjectClassification, + ArrayRef Args, + OverloadCandidateSet &CandidateSet, + bool SuppressUserConversions, + bool PartialOverloading, + ConversionSequenceList EarlyConversions, + OverloadCandidateParamOrder PO) { const FunctionProtoType *Proto = dyn_cast(Method->getType()->getAs()); assert(Proto && "Methods without a prototype cannot be overloaded"); @@ -7494,8 +7493,6 @@ void Sema::AddMethodCandidate( Candidate.TookAddressOfOverload = CandidateSet.getKind() == OverloadCandidateSet::CSK_AddressOfOverloadSet; Candidate.ExplicitCallArguments = Args.size(); - Candidate.HasMatchedPackOnParmToNonPackOnArg = - HasMatchedPackOnParmToNonPackOnArg; bool IgnoreExplicitObject = (Method->isExplicitObjectMemberFunction() && @@ -7666,8 +7663,8 @@ void Sema::AddMethodTemplateCandidate( ConversionSequenceList Conversions; if (TemplateDeductionResult Result = DeduceTemplateArguments( MethodTmpl, ExplicitTemplateArgs, Args, Specialization, Info, - PartialOverloading, /*AggregateDeductionCandidate=*/false, - /*PartialOrdering=*/false, ObjectType, ObjectClassification, + PartialOverloading, 
/*AggregateDeductionCandidate=*/false, ObjectType, + ObjectClassification, [&](ArrayRef ParamTypes) { return CheckNonDependentConversions( MethodTmpl, ParamTypes, Args, CandidateSet, Conversions, @@ -7705,8 +7702,7 @@ void Sema::AddMethodTemplateCandidate( AddMethodCandidate(cast(Specialization), FoundDecl, ActingContext, ObjectType, ObjectClassification, Args, CandidateSet, SuppressUserConversions, PartialOverloading, - Conversions, PO, - Info.hasMatchedPackOnParmToNonPackOnArg()); + Conversions, PO); } /// Determine whether a given function template has a simple explicit specifier @@ -7752,7 +7748,6 @@ void Sema::AddTemplateOverloadCandidate( if (TemplateDeductionResult Result = DeduceTemplateArguments( FunctionTemplate, ExplicitTemplateArgs, Args, Specialization, Info, PartialOverloading, AggregateCandidateDeduction, - /*PartialOrdering=*/false, /*ObjectType=*/QualType(), /*ObjectClassification=*/Expr::Classification(), [&](ArrayRef ParamTypes) { @@ -7793,8 +7788,7 @@ void Sema::AddTemplateOverloadCandidate( Specialization, FoundDecl, Args, CandidateSet, SuppressUserConversions, PartialOverloading, AllowExplicit, /*AllowExplicitConversions=*/false, IsADLCandidate, Conversions, PO, - Info.AggregateDeductionCandidateHasMismatchedArity, - Info.hasMatchedPackOnParmToNonPackOnArg()); + Info.AggregateDeductionCandidateHasMismatchedArity); } bool Sema::CheckNonDependentConversions( @@ -7916,8 +7910,7 @@ void Sema::AddConversionCandidate( CXXConversionDecl *Conversion, DeclAccessPair FoundDecl, CXXRecordDecl *ActingContext, Expr *From, QualType ToType, OverloadCandidateSet &CandidateSet, bool AllowObjCConversionOnExplicit, - bool AllowExplicit, bool AllowResultConversion, - bool HasMatchedPackOnParmToNonPackOnArg) { + bool AllowExplicit, bool AllowResultConversion) { assert(!Conversion->getDescribedFunctionTemplate() && "Conversion function templates use AddTemplateConversionCandidate"); QualType ConvType = Conversion->getConversionType().getNonReferenceType(); @@ 
-7962,8 +7955,6 @@ void Sema::AddConversionCandidate( Candidate.FinalConversion.setAllToTypes(ToType); Candidate.Viable = true; Candidate.ExplicitCallArguments = 1; - Candidate.HasMatchedPackOnParmToNonPackOnArg = - HasMatchedPackOnParmToNonPackOnArg; // Explicit functions are not actually candidates at all if we're not // allowing them in this context, but keep them around so we can point @@ -8165,8 +8156,7 @@ void Sema::AddTemplateConversionCandidate( assert(Specialization && "Missing function template specialization?"); AddConversionCandidate(Specialization, FoundDecl, ActingDC, From, ToType, CandidateSet, AllowObjCConversionOnExplicit, - AllowExplicit, AllowResultConversion, - Info.hasMatchedPackOnParmToNonPackOnArg()); + AllowExplicit, AllowResultConversion); } void Sema::AddSurrogateCandidate(CXXConversionDecl *Conversion, @@ -10519,10 +10509,6 @@ bool clang::isBetterOverloadCandidate( isa(Cand2.Function)) return isa(Cand1.Function); - if (Cand1.HasMatchedPackOnParmToNonPackOnArg != - Cand2.HasMatchedPackOnParmToNonPackOnArg) - return Cand2.HasMatchedPackOnParmToNonPackOnArg; - // -- F1 is a non-template function and F2 is a function template // specialization, or, if not that, bool Cand1IsSpecialization = Cand1.Function && diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 62d0d0914fa306..4f13669c2490c0 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -5179,8 +5179,7 @@ bool Sema::CheckTemplateArgument( unsigned ArgumentPackIndex, SmallVectorImpl &SugaredConverted, SmallVectorImpl &CanonicalConverted, - CheckTemplateArgumentKind CTAK, bool PartialOrdering, - bool *MatchedPackOnParmToNonPackOnArg) { + CheckTemplateArgumentKind CTAK, bool *MatchedPackOnParmToNonPackOnArg) { // Check template type parameters. 
if (TemplateTypeParmDecl *TTP = dyn_cast(Param)) return CheckTemplateTypeArgument(TTP, Arg, SugaredConverted, @@ -5395,7 +5394,8 @@ bool Sema::CheckTemplateArgument( case TemplateArgument::Template: case TemplateArgument::TemplateExpansion: - if (CheckTemplateTemplateArgument(TempParm, Params, Arg, PartialOrdering, + if (CheckTemplateTemplateArgument(TempParm, Params, Arg, + /*IsDeduced=*/CTAK != CTAK_Specified, MatchedPackOnParmToNonPackOnArg)) return true; @@ -5546,11 +5546,10 @@ bool Sema::CheckTemplateArgumentList( if (ArgIdx < NumArgs) { // Check the template argument we were given. - if (CheckTemplateArgument(*Param, NewArgs[ArgIdx], Template, TemplateLoc, - RAngleLoc, SugaredArgumentPack.size(), - SugaredConverted, CanonicalConverted, - CTAK_Specified, /*PartialOrdering=*/false, - MatchedPackOnParmToNonPackOnArg)) + if (CheckTemplateArgument( + *Param, NewArgs[ArgIdx], Template, TemplateLoc, RAngleLoc, + SugaredArgumentPack.size(), SugaredConverted, CanonicalConverted, + CTAK_Specified, MatchedPackOnParmToNonPackOnArg)) return true; CanonicalConverted.back().setIsDefaulted( @@ -5708,7 +5707,7 @@ bool Sema::CheckTemplateArgumentList( // Check the default template argument. 
if (CheckTemplateArgument(*Param, Arg, Template, TemplateLoc, RAngleLoc, 0, SugaredConverted, CanonicalConverted, - CTAK_Specified, /*PartialOrdering=*/false, + CTAK_Specified, /*MatchedPackOnParmToNonPackOnArg=*/nullptr)) return true; @@ -7294,7 +7293,7 @@ static void DiagnoseTemplateParameterListArityMismatch( bool Sema::CheckTemplateTemplateArgument( TemplateTemplateParmDecl *Param, TemplateParameterList *Params, - TemplateArgumentLoc &Arg, bool PartialOrdering, + TemplateArgumentLoc &Arg, bool IsDeduced, bool *MatchedPackOnParmToNonPackOnArg) { TemplateName Name = Arg.getArgument().getAsTemplateOrTemplatePattern(); auto [Template, DefaultArgs] = Name.getTemplateDeclAndDefaultArgs(); @@ -7339,8 +7338,8 @@ bool Sema::CheckTemplateTemplateArgument( // A template-argument matches a template template-parameter P when P // is at least as specialized as the template-argument A. if (!isTemplateTemplateParameterAtLeastAsSpecializedAs( - Params, Param, Template, DefaultArgs, Arg.getLocation(), - PartialOrdering, MatchedPackOnParmToNonPackOnArg)) + Params, Param, Template, DefaultArgs, Arg.getLocation(), IsDeduced, + MatchedPackOnParmToNonPackOnArg)) return true; // P2113 // C++20[temp.func.order]p2 diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index e49d315f7186bc..48a39a90f72a8b 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -2955,7 +2955,7 @@ Sema::getIdentityTemplateArgumentLoc(NamedDecl *TemplateParm, /// fully-converted template arguments. static bool ConvertDeducedTemplateArgument( Sema &S, NamedDecl *Param, DeducedTemplateArgument Arg, NamedDecl *Template, - TemplateDeductionInfo &Info, bool IsDeduced, bool PartialOrdering, + TemplateDeductionInfo &Info, bool IsDeduced, SmallVectorImpl &SugaredOutput, SmallVectorImpl &CanonicalOutput) { auto ConvertArg = [&](DeducedTemplateArgument Arg, @@ -2976,7 +2976,7 @@ static bool ConvertDeducedTemplateArgument( ? 
(Arg.wasDeducedFromArrayBound() ? Sema::CTAK_DeducedFromArrayBound : Sema::CTAK_Deduced) : Sema::CTAK_Specified, - PartialOrdering, &MatchedPackOnParmToNonPackOnArg); + &MatchedPackOnParmToNonPackOnArg); if (MatchedPackOnParmToNonPackOnArg) Info.setMatchedPackOnParmToNonPackOnArg(); return Res; @@ -3062,9 +3062,9 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( SmallVectorImpl &Deduced, TemplateDeductionInfo &Info, SmallVectorImpl &SugaredBuilder, - SmallVectorImpl &CanonicalBuilder, bool PartialOrdering, - LocalInstantiationScope *CurrentInstantiationScope, - unsigned NumAlreadyConverted, bool *IsIncomplete) { + SmallVectorImpl &CanonicalBuilder, + LocalInstantiationScope *CurrentInstantiationScope = nullptr, + unsigned NumAlreadyConverted = 0, bool *IsIncomplete = nullptr) { TemplateParameterList *TemplateParams = Template->getTemplateParameters(); for (unsigned I = 0, N = TemplateParams->size(); I != N; ++I) { @@ -3107,8 +3107,8 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( // We may have deduced this argument, so it still needs to be // checked and converted. if (ConvertDeducedTemplateArgument(S, Param, Deduced[I], Template, Info, - IsDeduced, PartialOrdering, - SugaredBuilder, CanonicalBuilder)) { + IsDeduced, SugaredBuilder, + CanonicalBuilder)) { Info.Param = makeTemplateParameter(Param); // FIXME: These template arguments are temporary. Free them! Info.reset( @@ -3174,8 +3174,7 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( // Check whether we can actually use the default argument. 
if (S.CheckTemplateArgument( Param, DefArg, TD, TD->getLocation(), TD->getSourceRange().getEnd(), - /*ArgumentPackIndex=*/0, SugaredBuilder, CanonicalBuilder, - Sema::CTAK_Specified, /*PartialOrdering=*/false, + 0, SugaredBuilder, CanonicalBuilder, Sema::CTAK_Specified, /*MatchedPackOnParmToNonPackOnArg=*/nullptr)) { Info.Param = makeTemplateParameter( const_cast(TemplateParams->getParam(I))); @@ -3284,9 +3283,7 @@ FinishTemplateArgumentDeduction( SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( S, Partial, IsPartialOrdering, Deduced, Info, SugaredBuilder, - CanonicalBuilder, IsPartialOrdering, - /*CurrentInstantiationScope=*/nullptr, /*NumAlreadyConverted=*/0, - /*IsIncomplete=*/nullptr); + CanonicalBuilder); Result != TemplateDeductionResult::Success) return Result; @@ -3386,10 +3383,10 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( // explicitly specified, template argument deduction fails. SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( - S, Template, /*IsDeduced=*/PartialOrdering, Deduced, Info, - SugaredBuilder, CanonicalBuilder, PartialOrdering, + S, Template, /*IsDeduced*/ PartialOrdering, Deduced, Info, + SugaredBuilder, CanonicalBuilder, /*CurrentInstantiationScope=*/nullptr, - /*NumAlreadyConverted=*/0U, /*IsIncomplete=*/nullptr); + /*NumAlreadyConverted=*/0U); Result != TemplateDeductionResult::Success) return Result; @@ -3454,9 +3451,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( S, TD, /*IsDeduced=*/false, Deduced, Info, SugaredBuilder, - CanonicalBuilder, /*PartialOrdering=*/false, - /*CurrentInstantiationScope=*/nullptr, /*NumAlreadyConverted=*/0, - /*IsIncomplete=*/nullptr); + CanonicalBuilder); Result != TemplateDeductionResult::Success) return Result; @@ -3994,8 +3989,7 @@ TemplateDeductionResult 
Sema::FinishTemplateArgumentDeduction( unsigned NumExplicitlySpecified, FunctionDecl *&Specialization, TemplateDeductionInfo &Info, SmallVectorImpl const *OriginalCallArgs, - bool PartialOverloading, bool PartialOrdering, - llvm::function_ref CheckNonDependent) { + bool PartialOverloading, llvm::function_ref CheckNonDependent) { // Unevaluated SFINAE context. EnterExpressionEvaluationContext Unevaluated( *this, Sema::ExpressionEvaluationContext::Unevaluated); @@ -4018,10 +4012,9 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( bool IsIncomplete = false; SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( - *this, FunctionTemplate, /*IsDeduced=*/true, Deduced, Info, - SugaredBuilder, CanonicalBuilder, PartialOrdering, - CurrentInstantiationScope, NumExplicitlySpecified, - PartialOverloading ? &IsIncomplete : nullptr); + *this, FunctionTemplate, /*IsDeduced*/ true, Deduced, Info, + SugaredBuilder, CanonicalBuilder, CurrentInstantiationScope, + NumExplicitlySpecified, PartialOverloading ? 
&IsIncomplete : nullptr); Result != TemplateDeductionResult::Success) return Result; @@ -4553,8 +4546,7 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef Args, FunctionDecl *&Specialization, TemplateDeductionInfo &Info, bool PartialOverloading, bool AggregateDeductionCandidate, - bool PartialOrdering, QualType ObjectType, - Expr::Classification ObjectClassification, + QualType ObjectType, Expr::Classification ObjectClassification, llvm::function_ref)> CheckNonDependent) { if (FunctionTemplate->isInvalidDecl()) return TemplateDeductionResult::Invalid; @@ -4769,8 +4761,7 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( runWithSufficientStackSpace(Info.getLocation(), [&] { Result = FinishTemplateArgumentDeduction( FunctionTemplate, Deduced, NumExplicitlySpecified, Specialization, Info, - &OriginalCallArgs, PartialOverloading, PartialOrdering, - [&, CallingCtx]() { + &OriginalCallArgs, PartialOverloading, [&, CallingCtx]() { ContextRAII SavedContext(*this, CallingCtx); return CheckNonDependent(ParamTypesForArgChecking); }); @@ -4882,10 +4873,9 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( TemplateDeductionResult Result; runWithSufficientStackSpace(Info.getLocation(), [&] { - Result = FinishTemplateArgumentDeduction( - FunctionTemplate, Deduced, NumExplicitlySpecified, Specialization, Info, - /*OriginalCallArgs=*/nullptr, /*PartialOverloading=*/false, - /*PartialOrdering=*/true); + Result = FinishTemplateArgumentDeduction(FunctionTemplate, Deduced, + NumExplicitlySpecified, + Specialization, Info); }); if (Result != TemplateDeductionResult::Success) return Result; @@ -5065,10 +5055,9 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( FunctionDecl *ConversionSpecialized = nullptr; TemplateDeductionResult Result; runWithSufficientStackSpace(Info.getLocation(), [&] { - Result = FinishTemplateArgumentDeduction( - ConversionTemplate, Deduced, 0, ConversionSpecialized, Info, - 
&OriginalCallArgs, /*PartialOverloading=*/false, - /*PartialOrdering=*/false); + Result = FinishTemplateArgumentDeduction(ConversionTemplate, Deduced, 0, + ConversionSpecialized, Info, + &OriginalCallArgs); }); Specialization = cast_or_null(ConversionSpecialized); return Result; @@ -5645,8 +5634,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( S, FTD, /*IsDeduced=*/true, Deduced, Info, SugaredBuilder, - CanonicalBuilder, /*PartialOrdering=*/true, - /*CurrentInstantiationScope=*/nullptr, + CanonicalBuilder, /*CurrentInstantiationScope=*/nullptr, /*NumAlreadyConverted=*/0, &IsIncomplete); Result != TemplateDeductionResult::Success) return Result; @@ -6491,8 +6479,8 @@ bool Sema::isMoreSpecializedThanPrimary( bool Sema::isTemplateTemplateParameterAtLeastAsSpecializedAs( TemplateParameterList *P, TemplateDecl *PArg, TemplateDecl *AArg, - const DefaultArguments &DefaultArgs, SourceLocation ArgLoc, - bool PartialOrdering, bool *MatchedPackOnParmToNonPackOnArg) { + const DefaultArguments &DefaultArgs, SourceLocation ArgLoc, bool IsDeduced, + bool *MatchedPackOnParmToNonPackOnArg) { // C++1z [temp.arg.template]p4: (DR 150) // A template template-parameter P is at least as specialized as a // template template-argument A if, given the following rewrite to two @@ -6571,7 +6559,7 @@ bool Sema::isTemplateTemplateParameterAtLeastAsSpecializedAs( switch (::DeduceTemplateArguments( *this, A, AArgs, PArgs, Info, Deduced, /*NumberOfArgumentsMustMatch=*/false, /*PartialOrdering=*/true, - PartialOrdering ? PackFold::ArgumentToParameter : PackFold::Both, + IsDeduced ? 
PackFold::ArgumentToParameter : PackFold::Both, /*HasDeducedAnyParam=*/nullptr)) { case clang::TemplateDeductionResult::Success: if (MatchedPackOnParmToNonPackOnArg && diff --git a/clang/test/SemaTemplate/cwg2398.cpp b/clang/test/SemaTemplate/cwg2398.cpp index 3825239de4a285..56091e84cf4e95 100644 --- a/clang/test/SemaTemplate/cwg2398.cpp +++ b/clang/test/SemaTemplate/cwg2398.cpp @@ -405,87 +405,6 @@ namespace packs { } // namespace t4 } // namespace packs -namespace fun_tmpl_call { - namespace match_func { - template